@article{sklearn,
    title = {Scikit-learn: Machine Learning in {P}ython},
    author = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
        and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
        and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
        Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
    journal = {Journal of Machine Learning Research},
    volume = {12},
    pages = {2825--2830},
    year = {2011}
}


@inproceedings{pandas,
    author = {Wes McKinney},
    title = {Data Structures for Statistical Computing in {P}ython},
    booktitle = {Proceedings of the 9th {P}ython in {S}cience {C}onference},
    pages = {56--61},
    year = {2010},
    editor = {St\'efan van der Walt and Jarrod Millman},
    doi = {10.25080/Majora-92bf1922-00a}
}


@inproceedings{niculescu09_kdd,
    title = {Winning the KDD Cup Orange Challenge with Ensemble Selection},
    author = {Alexandru Niculescu-Mizil and Claudia Perlich and Grzegorz Swirszcz
        and Vikas Sindhwani and Yan Liu and Prem Melville and Dong Wang and Jing Xiao
        and Jianying Hu and Moninder Singh and Wei Xiong Shang and Yan Feng Zhu},
    booktitle = {Proceedings of KDD-Cup 2009 Competition},
    pages = {23--34},
    year = {2009},
    editor = {Gideon Dror and Marc Boull{\'e} and Isabelle Guyon and Vincent Lemaire
        and David Vogel},
    volume = {7},
    series = {Proceedings of Machine Learning Research},
    address = {New York, New York, USA},
    month = {28 Jun},
    publisher = {PMLR},
    pdf = {http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf},
    url = {http://proceedings.mlr.press/v7/niculescu09.html},
    abstract = {We describe our winning solution for the KDD Cup Orange Challenge.}
}
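
The ensemble selection named in this title is a greedy hillclimbing procedure over a
library of fitted models. A minimal sketch of the idea, not the authors' code; the AUC
metric and with-replacement selection are assumptions modeled on Caruana-style
ensemble selection:

    import numpy as np
    from sklearn.metrics import roc_auc_score

    def ensemble_selection(val_preds, y_val, n_rounds=10):
        # Greedy hillclimbing: each round, add (with replacement) the model
        # whose inclusion maximizes validation AUC; repeats act as weights.
        chosen, total = [], np.zeros(len(y_val))
        for _ in range(n_rounds):
            scores = [roc_auc_score(y_val, (total + p) / (len(chosen) + 1))
                      for p in val_preds]
            best = int(np.argmax(scores))
            chosen.append(best)
            total += val_preds[best]
        return chosen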


@article{micci_mean_encoder,
    author = {Micci-Barreca, Daniele},
    title = {A Preprocessing Scheme for High-Cardinality Categorical Attributes in
        Classification and Prediction Problems},
    year = {2001},
    issue_date = {July 2001},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    volume = {3},
    number = {1},
    issn = {1931-0145},
    url = {https://doi.org/10.1145/507533.507538},
    doi = {10.1145/507533.507538},
    abstract = {Categorical data fields characterized by a large number of distinct values represent
        a serious challenge for many classification and regression algorithms that require
        numerical inputs. On the other hand, these types of data fields are quite common in
        real-world data mining applications and often contain potentially relevant information
        that is difficult to represent for modeling purposes. This paper presents a simple
        preprocessing scheme for high-cardinality categorical data that allows this class
        of attributes to be used in predictive models such as neural networks, linear and
        logistic regression. The proposed method is based on a well-established statistical
        method (empirical Bayes) that is straightforward to implement as an in-database procedure.
        Furthermore, for categorical attributes with an inherent hierarchical structure, like
        ZIP codes, the preprocessing scheme can directly leverage the hierarchy by blending
        statistics at the various levels of aggregation. While the statistical methods discussed
        in this paper were first introduced in the mid 1950's, the use of these methods as
        a preprocessing step for complex models, like neural networks, has not been previously
        discussed in any literature.},
    journal = {SIGKDD Explor. Newsl.},
    month = jul,
    pages = {27--32},
    numpages = {6},
    keywords = {categorical attributes, neural networks, predictive models, hierarchical
        attributes, empirical bayes}
}
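
The empirical-Bayes blending this abstract describes is, in modern terms, smoothed
target (mean) encoding. A minimal pandas sketch of the idea; the column names and the
smoothing weight m are illustrative, not from the paper:

    import pandas as pd

    def mean_encode(df, cat_col, target_col, m=10.0):
        # Weighted blend of the per-category target mean and the global mean;
        # small categories shrink toward the global mean (empirical-Bayes style).
        global_mean = df[target_col].mean()
        stats = df.groupby(cat_col)[target_col].agg(["mean", "count"])
        smoothed = (stats["count"] * stats["mean"] + m * global_mean) / (stats["count"] + m)
        return df[cat_col].map(smoothed)

    df = pd.DataFrame({"zip": ["a", "a", "b", "c"], "y": [1, 0, 1, 0]})
    df["zip_encoded"] = mean_encode(df, "zip", "y")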


@article{boxcox,
    author = {Box, G. E. P. and Cox, D. R.},
    title = {An Analysis of Transformations},
    journal = {Journal of the Royal Statistical Society: Series B (Methodological)},
    volume = {26},
    number = {2},
    pages = {211--243},
    doi = {10.1111/j.2517-6161.1964.tb00553.x},
    url = {https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.2517-6161.1964.tb00553.x},
    eprint = {https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/j.2517-6161.1964.tb00553.x},
    abstract = {In the analysis of data it is often assumed that observations
        y1, y2, …, yn are independently normally distributed with constant variance and
        with expectations specified by a model linear in a set of parameters θ. In this
        paper we make the less restrictive assumption that such a normal, homoscedastic,
        linear model is appropriate after some suitable transformation has been applied
        to the y's. Inferences about the transformation and about the parameters of the
        linear model are made by computing the likelihood function and the relevant
        posterior distribution. The contributions of normality, homoscedasticity and
        additivity to the transformation are separated. The relation of the present
        methods to earlier procedures for finding transformations is discussed. The
        methods are illustrated with examples.},
    year = {1964}
}
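
For reference, the Box-Cox family is y(lambda) = (y**lambda - 1) / lambda for
lambda != 0 and log(y) for lambda == 0, defined for strictly positive y. SciPy fits
lambda by maximum likelihood, as in the paper; this toy sample is illustrative:

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(0)
    y = rng.lognormal(size=1000)            # skewed, strictly positive sample
    y_bc, fitted_lambda = stats.boxcox(y)   # lambda chosen by maximum likelihood
    print(fitted_lambda)                    # near 0, i.e. close to a log transform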


@article{yeojohnson,
    author = {Yeo, In-Kwon and Johnson, Richard A.},
    title = {A new family of power transformations to improve normality or symmetry},
    journal = {Biometrika},
    volume = {87},
    number = {4},
    pages = {954--959},
    year = {2000},
    month = {12},
    abstract = {We introduce a new power transformation family which is well defined on
        the whole real line and which is appropriate for reducing skewness and to approximate
        normality. It has properties similar to those of the Box--Cox transformation for positive
        variables. The large-sample properties of the transformation are investigated in the
        context of a single random sample.},
    issn = {0006-3444},
    doi = {10.1093/biomet/87.4.954},
    url = {https://doi.org/10.1093/biomet/87.4.954},
    eprint = {https://academic.oup.com/biomet/article-pdf/87/4/954/633221/870954.pdf}
}
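
Because the Yeo-Johnson family is defined on the whole real line, it handles negative
inputs that Box-Cox cannot. scikit-learn exposes it through PowerTransformer; the toy
data here is illustrative:

    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    X = np.array([[-2.0], [-0.5], [0.0], [1.5], [10.0]])  # negatives are allowed
    pt = PowerTransformer(method="yeo-johnson")           # also the default method
    X_t = pt.fit_transform(X)                             # standardized output
    print(pt.lambdas_)                                    # fitted lambda per column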


@article{data_prep,
    author = {Kotsiantis, Sotiris and Kanellopoulos, Dimitris and Pintelas, P.},
    year = {2006},
    month = jan,
    pages = {111--117},
    title = {Data Preprocessing for Supervised Learning},
    volume = {1},
    journal = {International Journal of Computer Science}
}


@misc{beatingkaggle,
    author = {Ying Dong},
    title = {Beating {Kaggle} the easy way},
    year = {2015},
    note = {Studienarbeit, Technische Universit{\"a}t Darmstadt},
    pdf = {https://www.ke.tu-darmstadt.de/lehre/arbeiten/studien/2015/Dong_Ying.pdf}
}


@article{domingos,
    author = {Domingos, Pedro},
    title = {A Few Useful Things to Know about Machine Learning},
    year = {2012},
    issue_date = {October 2012},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    volume = {55},
    number = {10},
    issn = {0001-0782},
    url = {https://doi.org/10.1145/2347736.2347755},
    doi = {10.1145/2347736.2347755},
    abstract = {Tapping into the "folk knowledge" needed to advance machine
        learning applications.},
    journal = {Commun. ACM},
    month = oct,
    pages = {78--87},
    numpages = {10}
}


@book{kdd_2009_competition,
    title = {The 2009 Knowledge Discovery and Data Mining Competition
        (KDD Cup 2009): Challenges in Machine Learning},
    editor = {Gideon Dror and Marc Boull{\'e} and Isabelle Guyon},
    year = {2011},
    publisher = {Microtome Publishing},
    pdf = {http://www.mtome.com/Publications/CiML/CiML-v3-book.pdf}
}


@inproceedings{miller09_kdd,
    title = {Predicting customer behaviour: The University of Melbourne's KDD Cup report},
    author = {Hugh Miller and Sandy Clarke and Stephen Lane and Andrew Lonie and
        David Lazaridis and Slave Petrovski and Owen Jones},
    booktitle = {Proceedings of KDD-Cup 2009 Competition},
    pages = {45--55},
    year = {2009},
    editor = {Gideon Dror and Marc Boull{\'e} and Isabelle Guyon and Vincent Lemaire
        and David Vogel},
    volume = {7},
    series = {Proceedings of Machine Learning Research},
    address = {New York, New York, USA},
    month = {28 Jun},
    publisher = {PMLR},
    pdf = {http://proceedings.mlr.press/v7/miller09/miller09.pdf},
    url = {http://proceedings.mlr.press/v7/miller09.html},
    abstract = {We discuss the challenges of the 2009 KDD Cup along with our ideas
        and methodologies for modelling the problem. The main stages included aggressive
        nonparametric feature selection, careful treatment of categorical variables and
        tuning a gradient boosting machine under Bernoulli loss with trees.}
}
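
Not the Melbourne team's code, but a sketch of the model class the abstract names: a
tree-based gradient boosting machine under Bernoulli (binomial deviance) loss, tuned by
cross-validated search. The parameter grid and synthetic data are illustrative:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_samples=500, random_state=0)
    search = GridSearchCV(
        GradientBoostingClassifier(random_state=0),  # binomial deviance loss by default
        param_grid={"learning_rate": [0.05, 0.1], "max_depth": [2, 3]},
        scoring="roc_auc",                           # the KDD Cup 2009 metric
        cv=3,
    )
    search.fit(X, y)
    print(search.best_params_)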


@inproceedings{kanter2015deep,
    author = {James Max Kanter and Kalyan Veeramachaneni},
    title = {Deep feature synthesis: Towards automating data science endeavors},
    booktitle = {2015 {IEEE} International Conference on Data Science and Advanced
        Analytics, DSAA 2015, Paris, France, October 19-21, 2015},
    pages = {1--10},
    year = {2015},
    organization = {IEEE},
    doi = {10.1109/DSAA.2015.7344858}
}


@article{christ_tsfresh,
    title = {Time Series FeatuRe Extraction on basis of Scalable Hypothesis tests
        (tsfresh -- A Python package)},
    journal = {Neurocomputing},
    volume = {307},
    pages = {72--77},
    year = {2018},
    issn = {0925-2312},
    doi = {10.1016/j.neucom.2018.03.067},
    url = {https://www.sciencedirect.com/science/article/pii/S0925231218304843},
    author = {Maximilian Christ and Nils Braun and Julius Neuffer and
        Andreas W. Kempa-Liehr},
    keywords = {Feature engineering, Time series, Feature extraction, Feature
        selection, Machine learning},
    abstract = {Time series feature engineering is a time-consuming process because
        scientists and engineers have to consider the multifarious algorithms of signal
        processing and time series analysis for identifying and extracting meaningful
        features from time series. The Python package tsfresh (Time Series FeatuRe
        Extraction on basis of Scalable Hypothesis tests) accelerates this process
        by combining 63 time series characterization methods, which by default compute
        a total of 794 time series features, with feature selection on basis of automatically
        configured hypothesis tests. By identifying statistically significant time series
        characteristics in an early stage of the data science process, tsfresh closes
        feedback loops with domain experts and fosters the development of domain
        specific features early on. The package implements standard APIs of time
        series and machine learning libraries (e.g. pandas and scikit-learn) and is
        designed for both exploratory analyses as well as straightforward integration
        into operational data science applications.}
}
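
A minimal tsfresh run on a toy long-format frame, to make the abstract's workflow
concrete; the id/time/value column names here are chosen for illustration:

    import pandas as pd
    from tsfresh import extract_features

    ts = pd.DataFrame({
        "id":    [1, 1, 1, 2, 2, 2],          # one group of rows per series
        "time":  [0, 1, 2, 0, 1, 2],
        "value": [1.0, 2.0, 3.0, 2.0, 2.0, 2.0],
    })
    features = extract_features(ts, column_id="id", column_sort="time")
    print(features.shape)                      # hundreds of features per series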


@article{category_encoders,
    doi = {10.21105/joss.00501},
    url = {https://doi.org/10.21105/joss.00501},
    year = {2018},
    publisher = {The Open Journal},
    volume = {3},
    number = {21},
    pages = {501},
    author = {William D. McGinnis and Chapman Siu and Andre S and Hanyu Huang},
    title = {Category Encoders: a scikit-learn-contrib package of transformers
        for encoding categorical data},
    journal = {Journal of Open Source Software}
}
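
The package's transformers follow the standard scikit-learn fit/transform API, so they
drop into pipelines; a small illustrative example with TargetEncoder and toy data:

    import pandas as pd
    import category_encoders as ce

    X = pd.DataFrame({"color": ["red", "blue", "red", "green"]})
    y = pd.Series([1, 0, 1, 0])
    encoder = ce.TargetEncoder(cols=["color"])   # one of the package's transformers
    X_encoded = encoder.fit_transform(X, y)      # scikit-learn-style usage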