-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
367 lines (274 loc) · 13.6 KB
/
data_preprocessing.py
File metadata and controls
367 lines (274 loc) · 13.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
import itertools
import re
import pandas as pd
import numpy as np
from catboost import Pool, FeaturesData
from constants import SCHOOLS_REVERSED, TARGET_LABELS
def _parse_str_nums(num_string):
    """
    Parse the number(s) in a string and average them if there are several.

    Word-only assessments ("Zero", "Forget it", in any letter case) count
    as 0. When the string holds exactly one number a leading sign is kept;
    when it holds several, a ``-`` between them is read as a range
    separator rather than a minus sign.

    :param num_string: a string of numbers and text
    :type num_string: String
    :return: the number found, the average of multiple numbers found, or
        NaN when the string contains no number at all
    :rtype: Float
    :example:
        "40% to 50%"  ->  45.0
        "30%-50%"     ->  40.0
        "-20%"        ->  -20.0
        "45.5%"       ->  45.5
        "Forget it"   ->  0.0
    """
    # str methods return new strings, so the normalised text has to be kept.
    # Both phrases are matched in upper case because upper() runs first.
    normalized = num_string.upper().replace("ZERO", "0").replace("FORGET IT", "0")
    # unsigned numbers, keeping an optional decimal part together ("45.5")
    unsigned_number = r'\d+(?:\.\d+)?'
    nums = re.findall(unsigned_number, normalized)
    # if there is only one number it is NOT a range, so a sign is meaningful
    if len(nums) == 1:
        nums = re.findall(r'[+-]?' + unsigned_number, normalized)
    if not nums:
        # nothing to average; NaN lets callers drop the value later
        return np.float64(np.nan)
    # float() rather than int() so decimal matches do not raise
    return np.average(np.asarray([float(n) for n in nums]))
def _squash_nested_lists(l_of_l):
    """
    Flatten a list of lists by one level, keeping the element order.

    :param l_of_l: list of lists
    :type l_of_l: List
    :return: one list holding every element of every inner list
    :rtype: List
    :example:
        [['a', 'b'], ['c'], ['d', 'e']]  ->  ['a', 'b', 'c', 'd', 'e']
    """
    return [element for inner in l_of_l for element in inner]
# TODO: do we care about case sensitivity?
def _preprocess_odds_string(string_of_odds):
    """
    Break a scraped odds string into alternating school / percent tokens.

    :param string_of_odds: string scraped from site describing an applicants odds of admittance
    :type string_of_odds: String
    :return: list of stripped strings with entries for either schools or percent chances
    :rtype: list
    :example:
        "Harvard Business School: 85% Stanford: 80%"
        ->  ['Harvard Business School', '85', 'Stanford', '80', '']
    """
    tokens = []
    # 1) every colon separates a school name from its odds
    for colon_piece in string_of_odds.split(':'):
        # 2) the last '%' separates the odds from the next school name
        for percent_piece in colon_piece.rsplit('%', 1):
            # 3) some snarky assessments have only text and no percent sign
            #    ("Forget it", "Zero"); there the last newline is the separator
            for line_piece in percent_piece.rsplit('\n', 1):
                tokens.append(line_piece.strip())
    return tokens
def _reduce_majors_dimensionality(data):
    """
    Collapse the MAJOR column into a single STEM / non-STEM flag.

    'Engineering' and 'STEM' become 1.0 and every other value found in the
    column becomes 0.0. The MAJOR column is removed and the flag is added
    as STEM_MAJOR; the input dataframe itself is left unmodified.

    :param data: dataframe with a MAJOR column
    :return: new dataframe with STEM_MAJOR in place of MAJOR
    """
    stem_labels = ['Engineering', 'STEM']
    # whatever else occurs in the column counts as non-STEM
    other_labels = list(set(data.MAJOR.values) - set(stem_labels))
    stem_flag = (
        data.MAJOR
        .replace(to_replace=stem_labels, value=1.0)
        .replace(to_replace=other_labels, value=0.0)
    )
    remaining = data.drop(['MAJOR'], axis=1, inplace=False)
    return remaining.join(pd.DataFrame({'STEM_MAJOR': stem_flag}))
def _reduce_race_dimensionality(data):
    """
    Collapse the RACE column into a single under-represented flag.

    'Black', 'Latinx' and 'Native American' become 1.0 and every other
    value found in the column becomes 0.0. The RACE column is removed and
    the flag is added as UNDER_REP; the input dataframe itself is left
    unmodified.

    :param data: dataframe with a RACE column
    :return: new dataframe with UNDER_REP in place of RACE
    """
    under_rep_labels = ['Black', 'Latinx', 'Native American']
    # whatever else occurs in the column counts as not under-represented
    other_labels = list(set(data.RACE.values) - set(under_rep_labels))
    under_rep_flag = (
        data.RACE
        .replace(to_replace=under_rep_labels, value=1.0)
        .replace(to_replace=other_labels, value=0.0)
    )
    remaining = data.drop(['RACE'], axis=1, inplace=False)
    return remaining.join(pd.DataFrame({'UNDER_REP': under_rep_flag}))
def _reduced_university_dimensionality(data):
    """
    Collapse the UNIVERSITY column into a binary name-brand flag.

    'Tier 1' and 'Tier 2' become 1.0 and 'Tier 3' becomes 0.0; any other
    value is passed through unchanged. The UNIVERSITY column is removed and
    the flag is added as NAME_BRAND_SCHOOL; the input dataframe itself is
    left unmodified.

    :param data: dataframe with a UNIVERSITY column
    :return: new dataframe with NAME_BRAND_SCHOOL in place of UNIVERSITY
    """
    brand_tiers = ['Tier 1', 'Tier 2']
    other_tiers = ['Tier 3']
    brand_flag = (
        data.UNIVERSITY
        .replace(to_replace=brand_tiers, value=1.0)
        .replace(to_replace=other_tiers, value=0.0)
    )
    remaining = data.drop(['UNIVERSITY'], axis=1, inplace=False)
    return remaining.join(pd.DataFrame({'NAME_BRAND_SCHOOL': brand_flag}))
def _reduce_gender_dimensionality(data):
    """
    Collapse the GENDER column into a binary FEMALE flag.

    Matching ignores letter case and surrounding whitespace, so 'Female',
    'FEMALE' and 'female' all become 1.0 and 'MALE', 'Male' and 'male' all
    become 0.0. Any other value (including missing values) is passed
    through unchanged. The GENDER column is removed and the flag is added
    as FEMALE; the input dataframe itself is left unmodified.

    :param data: dataframe with a GENDER column
    :return: new dataframe with FEMALE in place of GENDER
    """
    def _encode_gender(value):
        # only strings can name a gender; leave NaN / None / numbers alone
        if isinstance(value, str):
            key = value.strip().lower()
            if key == 'female':
                return 1.0
            if key == 'male':
                return 0.0
        return value

    female_flag = data.GENDER.map(_encode_gender)
    remaining = data.drop(['GENDER'], axis=1, inplace=False)
    return remaining.join(pd.DataFrame({'FEMALE': female_flag}))
def _drop_unused_and_expand_categorical_columns(data):
    """
    Return a copy of ``data`` without the columns the models do not use.

    The ODDS, INTERNATIONAL, JOBTITLE and AGE columns are removed. Despite
    the function name, no categorical expansion (one-hot encoding) is
    performed here; the frame is otherwise returned as-is.

    :param data: dataframe containing at least the four columns listed above
    :return: new dataframe without those columns
    """
    unused_columns = ['ODDS', 'INTERNATIONAL', 'JOBTITLE', 'AGE']
    return data.drop(columns=unused_columns)
def preprocess_data_4_catboost(data_df, output_path=None):
    """
    Preprocess data for gradient boosting with the catboost library.

    Because catboost does its own handling of categorical features, the
    categorical columns are handed over as-is (not one-hot encoded) next to
    the numeric ones. Steps:

    1. parse each row's ODDS string into one numeric label column per school
    2. reduce MAJOR to the numeric STEM_MAJOR flag
    3. drop the ODDS, INTERNATIONAL and JOBTITLE columns
    4. for every label column, drop rows containing NaN and build a Pool

    :param data_df: dataframe with the feature columns and an ODDS column
    :param output_path: not used by this function; kept in the signature
    :return: dictionary mapping each school label column to a catboost Pool
    :rtype: dict
    """
    # Shape of the Pool built for each school below:
    #   Pool(data=FeaturesData(num_feature_data=<float32 ndarray>,
    #                          cat_feature_data=<object ndarray>),
    #        label=<list of odds>)
    new_df_w_labels = data_df.copy()
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0
    for idx, odds_string in data_df.ODDS.items():
        # skip data qual errors and abnormalities
        if not isinstance(odds_string, str):
            continue
        divied_list = _preprocess_odds_string(odds_string)
        # The percent is always the entry right after the school, so the last
        # entry can never be a school with odds attached; stopping one short
        # avoids an IndexError. enumerate gives the position of *this*
        # occurrence (list.index would always point at the first one).
        for position, school_or_perc in enumerate(divied_list[:-1]):
            if school_or_perc in SCHOOLS_REVERSED:
                perc = divied_list[position + 1]
                # use the standardized name
                standard_school_name = SCHOOLS_REVERSED[school_or_perc]
                # insert the parsed odds for the correct row
                new_df_w_labels.at[idx, standard_school_name] = _parse_str_nums(perc)
    new_df_w_labels = _reduce_majors_dimensionality(new_df_w_labels)
    # drop unused columns
    data_after_drop = new_df_w_labels.drop(['ODDS', 'INTERNATIONAL', 'JOBTITLE'], axis=1, inplace=False)
    # Columns handed to catboost as categorical. MAJOR has been replaced by the
    # numeric STEM_MAJOR above, so only columns that still exist are kept;
    # selecting a missing column would raise a KeyError.
    categorical_cols = [
        col for col in ('UNIVERSITY', 'MAJOR', 'GENDER', 'RACE')
        if col in data_after_drop.columns
    ]
    # a dataframe of ONLY the features
    features_only_df = data_after_drop.drop(TARGET_LABELS, axis=1, inplace=False)
    # the label columns, in the order they appear in the dataframe
    target_labels = set(TARGET_LABELS)
    label_cols = [col for col in data_after_drop.columns if col in target_labels]
    multi_data_set_dict = {}
    for school in label_cols:
        # all features plus the label of this one school; rows with a NaN in
        # any feature or in the label are dropped, otherwise training fails
        df_for_school = features_only_df.join(data_after_drop[[school]]).dropna()
        numerical_features_np_array = df_for_school.drop([school] + categorical_cols, axis=1, inplace=False).values
        categorical_features_np_array = df_for_school[categorical_cols].values
        labels_as_list = df_for_school[school].tolist()
        multi_data_set_dict[school] = Pool(
            data=FeaturesData(
                num_feature_data=np.array(numerical_features_np_array, dtype=np.float32),
                cat_feature_data=np.array(categorical_features_np_array, dtype=object),
            ),
            label=labels_as_list,
        )
    return multi_data_set_dict
def preprocess_data(data_df, output_path=None):
    """
    Preprocess data for general regression modeling.

    Combines several steps: parsing the ODDS strings into one numeric label
    column per school, reducing MAJOR, RACE, UNIVERSITY and GENDER to binary
    flags, and dropping unused columns. Since each school uses its own
    model, each school gets its own set of features/labels with the rows
    containing NaN removed.

    :param data_df: dataframe of features and labels, including an ODDS column
    :param output_path: if given, the processed dataframe is written to this
        path as csv (useful for debugging)
    :return: tuple of (dictionary keyed by school label column, whose values
        are dicts with a 'features' dataframe and a 'labels' dataframe;
        the feature column names)
    :rtype: tuple
    """
    new_df_w_labels = data_df.copy()
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0
    for idx, odds_string in data_df.ODDS.items():
        if isinstance(odds_string, bytes):
            odds_string = odds_string.decode("utf-8")
        elif not isinstance(odds_string, str):
            # skip data qual errors and abnormalities
            continue
        divied_list = _preprocess_odds_string(odds_string)
        # The percent is always the entry right after the school, so the last
        # entry can never be a school with odds attached; stopping one short
        # avoids an IndexError. enumerate gives the position of *this*
        # occurrence (list.index would always point at the first one).
        for position, school_or_perc in enumerate(divied_list[:-1]):
            if school_or_perc in SCHOOLS_REVERSED:
                perc = divied_list[position + 1]
                # use the standardized name
                standard_school_name = SCHOOLS_REVERSED[school_or_perc]
                # insert the parsed odds for the correct row
                new_df_w_labels.at[idx, standard_school_name] = _parse_str_nums(perc)
    # the dataset has a ton of categories; combine them into binary flags
    # to reduce dimensionality
    new_df_w_labels = _reduce_majors_dimensionality(new_df_w_labels)
    new_df_w_labels = _reduce_race_dimensionality(new_df_w_labels)
    new_df_w_labels = _reduced_university_dimensionality(new_df_w_labels)
    new_df_w_labels = _reduce_gender_dimensionality(new_df_w_labels)
    df_processed = _drop_unused_and_expand_categorical_columns(new_df_w_labels)
    # write dataframe to csv after processing for debugging and things
    if output_path:
        df_processed.to_csv(output_path)
    # a dataframe of ONLY the features
    features_only_df = df_processed.drop(TARGET_LABELS, axis=1, inplace=False)
    # the label columns, in the order they appear in the dataframe
    target_labels = set(TARGET_LABELS)
    label_cols = [col for col in df_processed.columns if col in target_labels]
    multi_data_set_dict = {}
    # create a new dataset for each school that we are modeling
    for school in label_cols:
        # all features plus the label of this one school; rows with a NaN in
        # any feature or in the label are dropped, otherwise training fails
        df_for_school = features_only_df.join(df_processed[[school]]).dropna()
        # store the FEATURES & LABELS for a PARTICULAR SCHOOL (both dataframes)
        multi_data_set_dict[school] = {
            'features': df_for_school.drop([school], axis=1, inplace=False),
            'labels': df_for_school[[school]],
        }
    # set outside the loop so it is defined even when there are no label columns
    feature_col_names = features_only_df.columns
    return multi_data_set_dict, feature_col_names