Commit f7fae39

paper for joss (#312)
* add papers first draft
* first draft paper
* add @ to inline citations
* final edits to paper
* update format inline citations
* reformats zenodo bibtex entry
* fixes spelling and punctuation
* adds missing DOIs
* updates zenodo citation
* removes doi.org from citations
* removes zenodo citation
1 parent f74cec8 commit f7fae39

2 files changed: 364 additions & 0 deletions


paper/paper.bib

Lines changed: 260 additions & 0 deletions
@@ -0,0 +1,260 @@
@article{sklearn,
  title   = {Scikit-learn: Machine Learning in {P}ython},
  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
             and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
             and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
             Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2825--2830},
  year    = {2011}
}


@inproceedings{pandas,
  author    = {{W}es {M}c{K}inney},
  title     = {{D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython},
  booktitle = {{P}roceedings of the 9th {P}ython in {S}cience {C}onference},
  pages     = {56--61},
  year      = {2010},
  editor    = {{S}t\'efan van der {W}alt and {J}arrod {M}illman},
  doi       = {10.25080/Majora-92bf1922-00a}
}


@inproceedings{niculescu09_kdd,
  title     = {Winning the KDD Cup Orange Challenge with Ensemble Selection},
  author    = {Alexandru Niculescu-Mizil and Claudia Perlich and Grzegorz Swirszcz
               and Vikas Sindhwani and Yan Liu and Prem Melville and Dong Wang and Jing Xiao
               and Jianying Hu and Moninder Singh and Wei Xiong Shang and Yan Feng Zhu},
  booktitle = {Proceedings of KDD-Cup 2009 Competition},
  pages     = {23--34},
  year      = {2009},
  editor    = {Gideon Dror and Marc Boullé and Isabelle Guyon and Vincent Lemaire
               and David Vogel},
  volume    = {7},
  series    = {Proceedings of Machine Learning Research},
  address   = {New York, New York, USA},
  month     = {28 Jun},
  publisher = {PMLR},
  pdf       = {http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf},
  url       = {http://proceedings.mlr.press/v7/niculescu09.html},
  abstract  = {We describe our winning solution for the KDD Cup Orange Challenge.}
}


@article{micci_mean_encoder,
  author     = {Micci-Barreca, Daniele},
  title      = {A Preprocessing Scheme for High-Cardinality Categorical Attributes in
                Classification and Prediction Problems},
  journal    = {SIGKDD Explor. Newsl.},
  year       = {2001},
  issue_date = {July 2001},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {3},
  number     = {1},
  issn       = {1931-0145},
  url        = {https://doi.org/10.1145/507533.507538},
  doi        = {10.1145/507533.507538},
  month      = jul,
  pages      = {27--32},
  numpages   = {6},
  keywords   = {categorical attributes, neural networks, predictive models, hierarchical
                attributes, empirical bayes},
  abstract   = {Categorical data fields characterized by a large number of distinct values
                represent a serious challenge for many classification and regression algorithms
                that require numerical inputs. On the other hand, these types of data fields are
                quite common in real-world data mining applications and often contain potentially
                relevant information that is difficult to represent for modeling purposes. This
                paper presents a simple preprocessing scheme for high-cardinality categorical data
                that allows this class of attributes to be used in predictive models such as
                neural networks, linear and logistic regression. The proposed method is based on a
                well-established statistical method (empirical Bayes) that is straightforward to
                implement as an in-database procedure. Furthermore, for categorical attributes
                with an inherent hierarchical structure, like ZIP codes, the preprocessing scheme
                can directly leverage the hierarchy by blending statistics at the various levels
                of aggregation. While the statistical methods discussed in this paper were first
                introduced in the mid 1950's, the use of these methods as a preprocessing step for
                complex models, like neural networks, has not been previously discussed in any
                literature.}
}


@article{boxcox,
  author   = {Box, G. E. P. and Cox, D. R.},
  title    = {An Analysis of Transformations},
  journal  = {Journal of the Royal Statistical Society: Series B (Methodological)},
  volume   = {26},
  number   = {2},
  pages    = {211--243},
  year     = {1964},
  doi      = {10.1111/j.2517-6161.1964.tb00553.x},
  url      = {https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.2517-6161.1964.tb00553.x},
  eprint   = {https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/j.2517-6161.1964.tb00553.x},
  abstract = {In the analysis of data it is often assumed that observations y1, y2, …, yn are
              independently normally distributed with constant variance and with expectations
              specified by a model linear in a set of parameters θ. In this paper we make the
              less restrictive assumption that such a normal, homoscedastic, linear model is
              appropriate after some suitable transformation has been applied to the y's.
              Inferences about the transformation and about the parameters of the linear model
              are made by computing the likelihood function and the relevant posterior
              distribution. The contributions of normality, homoscedasticity and additivity to
              the transformation are separated. The relation of the present methods to earlier
              procedures for finding transformations is discussed. The methods are illustrated
              with examples.}
}


@article{yeojohnson,
  author   = {Yeo, In-Kwon and Johnson, Richard A.},
  title    = {A new family of power transformations to improve normality or symmetry},
  journal  = {Biometrika},
  volume   = {87},
  number   = {4},
  pages    = {954--959},
  year     = {2000},
  month    = {12},
  issn     = {0006-3444},
  doi      = {10.1093/biomet/87.4.954},
  url      = {https://doi.org/10.1093/biomet/87.4.954},
  eprint   = {https://academic.oup.com/biomet/article-pdf/87/4/954/633221/870954.pdf},
  abstract = {We introduce a new power transformation family which is well defined on the whole
              real line and which is appropriate for reducing skewness and to approximate
              normality. It has properties similar to those of the Box-Cox transformation for
              positive variables. The large-sample properties of the transformation are
              investigated in the context of a single random sample.}
}


@article{data_prep,
  author  = {Kotsiantis, Sotiris and Kanellopoulos, Dimitris and Pintelas, P.},
  title   = {Data Preprocessing for Supervised Learning},
  journal = {International Journal of Computer Science},
  volume  = {1},
  pages   = {111--117},
  month   = {01},
  year    = {2006}
}


@misc{beatingkaggle,
  author    = {Ying Dong},
  title     = {Beating {Kaggle} the easy way},
  year      = {2015},
  publisher = {Technische Universität Darmstadt},
  note      = {Studienarbeit},
  pdf       = {https://www.ke.tu-darmstadt.de/lehre/arbeiten/studien/2015/Dong_Ying.pdf}
}


@article{domingos,
  author     = {Domingos, Pedro},
  title      = {A Few Useful Things to Know about Machine Learning},
  journal    = {Commun. ACM},
  year       = {2012},
  issue_date = {October 2012},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {55},
  number     = {10},
  issn       = {0001-0782},
  url        = {https://doi.org/10.1145/2347736.2347755},
  doi        = {10.1145/2347736.2347755},
  month      = oct,
  pages      = {78--87},
  numpages   = {10},
  abstract   = {Tapping into the "folk knowledge" needed to advance machine learning
                applications.}
}


@inproceedings{kdd_2009_competition,
  title     = {The 2009 Knowledge Discovery and Data Mining Competition (KDD Cup 2009):
               Challenges in Machine Learning},
  author    = {G. Dror and M. Boull{\'e} and I. Guyon},
  year      = {2011},
  publisher = {Microtome Publishing},
  pdf       = {http://www.mtome.com/Publications/CiML/CiML-v3-book.pdf}
}


@inproceedings{miller09_kdd,
  title     = {Predicting customer behaviour: The University of Melbourne's KDD Cup report},
  author    = {Hugh Miller and Sandy Clarke and Stephen Lane and Andrew Lonie and
               David Lazaridis and Slave Petrovski and Owen Jones},
  booktitle = {Proceedings of KDD-Cup 2009 Competition},
  pages     = {45--55},
  year      = {2009},
  editor    = {Gideon Dror and Marc Boullé and Isabelle Guyon and Vincent Lemaire
               and David Vogel},
  volume    = {7},
  series    = {Proceedings of Machine Learning Research},
  address   = {New York, New York, USA},
  month     = {28 Jun},
  publisher = {PMLR},
  pdf       = {http://proceedings.mlr.press/v7/miller09/miller09.pdf},
  url       = {http://proceedings.mlr.press/v7/miller09.html},
  abstract  = {We discuss the challenges of the 2009 KDD Cup along with our ideas and
               methodologies for modelling the problem. The main stages included aggressive
               nonparametric feature selection, careful treatment of categorical variables and
               tuning a gradient boosting machine under Bernoulli loss with trees.}
}


@inproceedings{kanter2015deep,
  author       = {James Max Kanter and Kalyan Veeramachaneni},
  title        = {Deep feature synthesis: Towards automating data science endeavors},
  booktitle    = {2015 {IEEE} International Conference on Data Science and Advanced Analytics,
                  DSAA 2015, Paris, France, October 19-21, 2015},
  pages        = {1--10},
  year         = {2015},
  organization = {IEEE},
  doi          = {10.1109/DSAA.2015.7344858}
}


@article{christ_tsfresh,
  title    = {Time Series FeatuRe Extraction on basis of Scalable Hypothesis tests
              (tsfresh – A Python package)},
  author   = {Maximilian Christ and Nils Braun and Julius Neuffer and
              Andreas W. Kempa-Liehr},
  journal  = {Neurocomputing},
  volume   = {307},
  pages    = {72--77},
  year     = {2018},
  issn     = {0925-2312},
  doi      = {10.1016/j.neucom.2018.03.067},
  url      = {https://www.sciencedirect.com/science/article/pii/S0925231218304843},
  keywords = {Feature engineering, Time series, Feature extraction, Feature selection,
              Machine learning},
  abstract = {Time series feature engineering is a time-consuming process because scientists
              and engineers have to consider the multifarious algorithms of signal processing
              and time series analysis for identifying and extracting meaningful features from
              time series. The Python package tsfresh (Time Series FeatuRe Extraction on basis
              of Scalable Hypothesis tests) accelerates this process by combining 63 time
              series characterization methods, which by default compute a total of 794 time
              series features, with feature selection on basis of automatically configured
              hypothesis tests. By identifying statistically significant time series
              characteristics in an early stage of the data science process, tsfresh closes
              feedback loops with domain experts and fosters the development of domain specific
              features early on. The package implements standard APIs of time series and
              machine learning libraries (e.g. pandas and scikit-learn) and is designed for
              both exploratory analyses as well as straightforward integration into operational
              data science applications.}
}


@article{category_encoders,
  author    = {William D. McGinnis and Chapman Siu and Andre S and Hanyu Huang},
  title     = {Category Encoders: a scikit-learn-contrib package of transformers for encoding
               categorical data},
  journal   = {Journal of Open Source Software},
  year      = {2018},
  publisher = {The Open Journal},
  volume    = {3},
  number    = {21},
  pages     = {501},
  doi       = {10.21105/joss.00501},
  url       = {https://doi.org/10.21105/joss.00501}
}

paper/paper.md

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
---
title: 'Feature-engine: A Python package for feature engineering for machine learning'
tags:
  - python
  - feature engineering
  - feature selection
  - machine learning
  - data science
authors:
  - name: Soledad Galli
    affiliation: 1
affiliations:
  - name: Train in Data
    index: 1
date: 6 August 2021
bibliography: paper.bib
---

# Summary

Feature-engine is an open source Python library with the most exhaustive battery of
transformations to engineer and select features for use in machine learning. Feature-engine
supports several techniques to impute missing data, encode categorical variables, transform
variables mathematically, perform discretization, remove or censor outliers, and combine
variables into new features. Feature-engine also hosts an array of algorithms for feature
selection.

The primary goal of Feature-engine is to make commonly used data transformation procedures
accessible to researchers and data scientists, focusing on creating user-friendly and
intuitive classes that are compatible with existing machine learning libraries, like
Scikit-learn [@sklearn] and Pandas [@pandas].

Many feature transformation techniques learn parameters from data, like the values for
imputation or the mappings for encoding. Feature-engine classes learn these parameters
from the data and store them in their attributes to transform future data. Feature-engine's
transformers preserve Scikit-learn's functionality, exposing the methods fit() and
transform() to learn parameters from the data and then transform it. Feature-engine's
transformers can be incorporated into a Scikit-learn Pipeline to streamline data
transformation and facilitate model deployment, by allowing the serialization of the entire
pipeline in one pickle.

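As a minimal sketch of this fit/transform workflow (the `MeanMedianImputer` class, the
`variables` argument, and the `imputer_dict_` attribute are used here only as illustrative
assumptions and may differ between Feature-engine releases):

```python
# Illustrative sketch of the fit()/transform() pattern; class, argument and
# attribute names are assumed and may vary between Feature-engine versions.
import pandas as pd
from feature_engine.imputation import MeanMedianImputer

df = pd.DataFrame({"age": [20, 25, None, 40], "fare": [7.5, None, 12.0, 30.0]})

imputer = MeanMedianImputer(imputation_method="median", variables=["age", "fare"])
imputer.fit(df)               # learns the medians and stores them as attributes
print(imputer.imputer_dict_)  # learned parameters, e.g. {'age': 25.0, 'fare': 12.0}

new_data = imputer.transform(df)  # applies the stored medians to future data
```
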
When pre-processing a dataset, different feature transformations are applied to different
groups of variables. Feature-engine classes allow the user to select which variables to
transform within each class; therefore, while the entire dataframe is taken as input, only
the indicated variables are modified. Data pre-processing and feature engineering are
commonly done together with data exploration. Because Feature-engine transformers return
dataframes as output, users can continue to leverage the power of Pandas for data analysis
and visualization after transforming the dataset.

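The sketch below illustrates this behaviour, again with assumed class and argument names
(`OneHotEncoder`, `variables`): only the listed column is modified, the remaining columns
pass through untouched, and the output is a pandas dataframe that can feed straight back
into exploratory analysis.

```python
# Illustrative sketch: only the indicated variables are transformed and the
# output remains a pandas DataFrame (class and argument names are assumed).
import pandas as pd
from feature_engine.encoding import OneHotEncoder

df = pd.DataFrame({
    "city": ["London", "Paris", "London", "Rome"],
    "price": [10.0, 12.5, 11.0, 9.0],
})

encoder = OneHotEncoder(variables=["city"])  # encode only 'city'; 'price' is untouched
df_t = encoder.fit_transform(df)

print(type(df_t))   # <class 'pandas.core.frame.DataFrame'>
print(df_t.head())  # pandas analysis and visualization continue as usual
```
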
In summary, Feature-engine supports a large variety of commonly used data transformation
techniques [@data_prep; @boxcox; @yeojohnson; @kdd_2009_competition;
@beatingkaggle; @micci_mean_encoder], as well as techniques developed in data science
competitions [@niculescu09_kdd], including those for feature selection [@miller09_kdd].
Thus, Feature-engine builds upon and extends the capabilities of Python's current
scientific computing stack, making accessible to data scientists and practitioners
transformations that are otherwise not easy to find, understand, or code.

# Statement of need

Data scientists spend an enormous amount of time on data pre-processing and transformation
ahead of training machine learning models [@domingos]. While some feature engineering
processes can be domain-specific, a large variety of transformations are commonly applied
across datasets. For example, data scientists need to impute or remove missing values, or
transform categories into numbers, to train machine learning models with Scikit-learn, the
main Python library for machine learning. Yet, depending on the nature of the variable and
the characteristics of the machine learning model, they may need to use different
techniques.

Feature-engine gathers the most frequently used data pre-processing techniques, as well as
bespoke techniques developed in data science competitions, into one library, from which
users can pick and choose the transformation they need and use it just like they would use
any other Scikit-learn class. As a result, users are spared from manually writing a lot of
code, which is often repetitive because the same procedures are applied to different
datasets. In addition, Feature-engine classes are written to production standards, which
ensures that classes return the expected result and maximizes reproducibility between
research and production environments through version control.

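As a sketch of this usage pattern (transformer names and arguments are again illustrative
assumptions rather than a definitive listing of the library's API), Feature-engine steps
slot into a Scikit-learn Pipeline next to any estimator, and the fitted pipeline can be
stored as a single artifact:

```python
# Illustrative sketch: Feature-engine transformers used like any other
# Scikit-learn class inside a Pipeline, then serialized as one object.
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from feature_engine.encoding import OrdinalEncoder
from feature_engine.imputation import MeanMedianImputer

X = pd.DataFrame({
    "age": [22, None, 35, 58, None, 41],
    "city": ["London", "Paris", "Rome", "London", "Paris", "Rome"],
})
y = pd.Series([0, 1, 0, 1, 1, 0])

pipe = Pipeline([
    ("impute", MeanMedianImputer(variables=["age"])),
    ("encode", OrdinalEncoder(encoding_method="arbitrary", variables=["city"])),
    ("model", LogisticRegression()),
])

pipe.fit(X, y)
joblib.dump(pipe, "feature_pipeline.pkl")  # the entire pipeline in one pickle
```
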
In the last few years, a number of open source Python libraries that support feature
engineering techniques have emerged, highlighting the importance of making feature
engineering and creation accessible and, as much as possible, automated. Among these,
Featuretools [@kanter2015deep] creates features from temporal and relational datasets,
tsfresh [@christ_tsfresh] extracts features from time series, Category encoders
[@category_encoders] supports a comprehensive list of methods to encode categorical
variables, and Scikit-learn [@sklearn] implements a number of data transformation
techniques, with the caveat that the transformations are applied to the entire dataset and
the output is a NumPy array. Feature-engine extends the capabilities of Python's current
scientific computing stack by allowing the application of transformations to subsets of
variables in the dataset, returning dataframes for data exploration, and supporting
transformations not currently available in other libraries, like those for outlier
censoring or removal, besides additional techniques for discretization and feature
selection that were developed by data scientists working in industry or in data science
competitions.

# Acknowledgements

I would like to acknowledge all of the contributors and users of Feature-engine, who helped
with valuable feedback, bug fixes, and additional functionality to further improve the
library. A special thanks to Christopher Samiullah for continuous support on code quality
and architecture. A list of Feature-engine contributors is available at
https://github.com/feature-engine/feature_engine/graphs/contributors.

# References
