1
1
import pandas as pd
2
- import featuretools as ft
3
-
4
- from featuretools .variable_types import ZIPCode , Index , Datetime , Numeric , DatetimeTimeIndex , Categorical , Id , \
5
- SubRegionCode
2
+ import pytest
6
3
from pandas .util .testing import assert_frame_equal
7
4
8
- from autonormalize import classes , normalize , autonormalize
9
-
5
+ import featuretools as ft
6
+ from featuretools .variable_types import (
7
+ Categorical ,
8
+ Datetime ,
9
+ DatetimeTimeIndex ,
10
+ Id ,
11
+ Index ,
12
+ Numeric ,
13
+ Text ,
14
+ ZIPCode
15
+ )
16
+
17
+ from autonormalize import autonormalize , classes , normalize
10
18
11
19
# from classes import Dependencies
12
20
13
21
# from normalize import normalize, find_most_comm, split_on_dep
14
22
23
+ @pytest .fixture
24
+ def teams_input ():
25
+ class Teams :
26
+ def get_df (self ):
27
+ dic = {'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
28
+ 'Yellow' , 'Green' , 'Green' , 'Blue' ],
29
+ 'jersey_num' : [1 , 2 , 3 , 1 , 2 , 1 , 5 , 8 , 2 , 2 ],
30
+ 'player_name' : ['A' , 'B' , 'C' , 'D' , 'A' , 'E' , 'B' , 'A' , 'G' , 'H' ],
31
+ 'city' : ['boston' , 'boston' , 'boston' , 'chicago' , 'chicago' ,
32
+ 'honolulu' , 'honolulu' , 'boston' , 'boston' , 'austin' ],
33
+ 'state' : ['MA' , 'MA' , 'MA' , 'IL' , 'IL' , 'HI' , 'HI' , 'MA' , 'MA' , 'TX' ]}
34
+ return pd .DataFrame (dic )
35
+
36
+ def get_deps (self ):
37
+ return classes .Dependencies ({'team' : [['player_name' , 'jersey_num' ]],
38
+ 'jersey_num' : [['player_name' , 'team' ]],
39
+ 'player_name' : [['team' , 'jersey_num' ]],
40
+ 'city' : [['team' ], ['state' ], ['player_name' , 'jersey_num' ]],
41
+ 'state' : [['team' ], ['player_name' , 'jersey_num' ],
42
+ ['city' ]]}, ['team' , 'jersey_num' ])
43
+ return Teams ()
44
+
15
45
16
46
def test_normalize ():
17
47
# how to test that relations remain the same???
@@ -105,23 +135,8 @@ def test_choose_index():
105
135
assert normalize .choose_index (keys , df ) == ['A' , 'B' ]
106
136
107
137
108
- def test_normalize_dataframe ():
109
-
110
- dic = {'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
111
- 'Yellow' , 'Green' , 'Green' , 'Blue' ],
112
- 'jersey_num' : [1 , 2 , 3 , 1 , 2 , 1 , 5 , 8 , 2 , 2 ],
113
- 'player_name' : ['A' , 'B' , 'C' , 'D' , 'A' , 'E' , 'B' , 'A' , 'G' , 'H' ],
114
- 'city' : ['boston' , 'boston' , 'boston' , 'chicago' , 'chicago' ,
115
- 'honolulu' , 'honolulu' , 'boston' , 'boston' , 'austin' ],
116
- 'state' : ['MA' , 'MA' , 'MA' , 'IL' , 'IL' , 'HI' , 'HI' , 'MA' , 'MA' , 'TX' ]}
117
- df = pd .DataFrame (dic )
118
- deps = classes .Dependencies ({'team' : [['player_name' , 'jersey_num' ]],
119
- 'jersey_num' : [['player_name' , 'team' ]],
120
- 'player_name' : [['team' , 'jersey_num' ]],
121
- 'city' : [['team' ], ['state' ], ['player_name' , 'jersey_num' ]],
122
- 'state' : [['team' ], ['player_name' , 'jersey_num' ], ['city' ]]}, ['team' , 'jersey_num' ])
123
-
124
- depdf = normalize .DepDF (deps , df , deps .get_prim_key ())
138
+ def test_normalize_dataframe (teams_input ):
139
+ depdf = normalize .DepDF (teams_input .get_deps (), teams_input .get_df (), teams_input .get_deps ().get_prim_key ())
125
140
normalize .normalize_dataframe (depdf )
126
141
new_dfs = depdf .return_dfs ()
127
142
@@ -216,21 +231,8 @@ def test_variable_types():
216
231
assert normalized_entityset ['customer_id' ].variable_types ['zip_code' ] == ZIPCode
217
232
218
233
219
- def test_make_entityset_default_args ():
220
- dic = {'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
221
- 'Yellow' , 'Green' , 'Green' , 'Blue' ],
222
- 'jersey_num' : [1 , 2 , 3 , 1 , 2 , 1 , 5 , 8 , 2 , 2 ],
223
- 'player_name' : ['A' , 'B' , 'C' , 'D' , 'A' , 'E' , 'B' , 'A' , 'G' , 'H' ],
224
- 'city' : ['boston' , 'boston' , 'boston' , 'chicago' , 'chicago' ,
225
- 'honolulu' , 'honolulu' , 'boston' , 'boston' , 'austin' ],
226
- 'state' : ['US-MA' , 'US-MA' , 'US-MA' , 'US-IL' , 'US-IL' , 'US-HI' , 'US-HI' , 'US-MA' , 'US-MA' , 'US-TX' ]}
227
- df = pd .DataFrame (dic )
228
- deps = classes .Dependencies ({'team' : [['player_name' , 'jersey_num' ]],
229
- 'jersey_num' : [['player_name' , 'team' ]],
230
- 'player_name' : [['team' , 'jersey_num' ]],
231
- 'city' : [['team' ], ['state' ], ['player_name' , 'jersey_num' ]],
232
- 'state' : [['team' ], ['player_name' , 'jersey_num' ], ['city' ]]}, ['team' , 'jersey_num' ])
233
- normalized_entityset = autonormalize .make_entityset (df , deps )
234
+ def test_make_entityset_default_args (teams_input ):
235
+ normalized_entityset = autonormalize .make_entityset (teams_input .get_df (), teams_input .get_deps ())
234
236
235
237
dic_one = {'team_jersey_num' : [0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 ],
236
238
'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
@@ -242,7 +244,7 @@ def test_make_entityset_default_args():
242
244
'city' : ['austin' , 'boston' , 'chicago' , 'boston' , 'honolulu' ]}
243
245
244
246
dic_three = {'city' : ['austin' , 'boston' , 'chicago' , 'honolulu' ],
245
- 'state' : ['US-TX' , 'US-MA' , 'US-IL' , 'US-HI' , ]}
247
+ 'state' : ['TX' , 'MA' , 'IL' , 'HI' ]}
246
248
247
249
assert len (normalized_entityset .entities ) == 3
248
250
@@ -264,24 +266,11 @@ def test_make_entityset_default_args():
264
266
assert normalized_entityset .entities [2 ].variable_types ['state' ] == Categorical
265
267
266
268
267
- def test_make_entityset_custom_args ():
268
- dic = {'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
269
- 'Yellow' , 'Green' , 'Green' , 'Blue' ],
270
- 'jersey_num' : [1 , 2 , 3 , 1 , 2 , 1 , 5 , 8 , 2 , 2 ],
271
- 'player_name' : ['A' , 'B' , 'C' , 'D' , 'A' , 'E' , 'B' , 'A' , 'G' , 'H' ],
272
- 'city' : ['boston' , 'boston' , 'boston' , 'chicago' , 'chicago' ,
273
- 'honolulu' , 'honolulu' , 'boston' , 'boston' , 'austin' ],
274
- 'state' : ['US-MA' , 'US-MA' , 'US-MA' , 'US-IL' , 'US-IL' , 'US-HI' , 'US-HI' , 'US-MA' , 'US-MA' , 'US-TX' ]}
275
- df = pd .DataFrame (dic )
276
- deps = classes .Dependencies ({'team' : [['player_name' , 'jersey_num' ]],
277
- 'jersey_num' : [['player_name' , 'team' ]],
278
- 'player_name' : [['team' , 'jersey_num' ]],
279
- 'city' : [['team' ], ['state' ], ['player_name' , 'jersey_num' ]],
280
- 'state' : [['team' ], ['player_name' , 'jersey_num' ], ['city' ]]}, ['team' , 'jersey_num' ])
281
- normalized_entityset = autonormalize .make_entityset (df = df ,
282
- dependencies = deps ,
283
- name = 'Sport' ,
284
- variable_types = {'state' : SubRegionCode })
269
+ def test_make_entityset_custom_args (teams_input ):
270
+ normalized_entityset = autonormalize .make_entityset (df = teams_input .get_df (),
271
+ dependencies = teams_input .get_deps (),
272
+ name = 'Teams' ,
273
+ variable_types = {'state' : Text })
285
274
286
275
dic_one = {'team_jersey_num' : [0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 ],
287
276
'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
@@ -293,10 +282,10 @@ def test_make_entityset_custom_args():
293
282
'city' : ['austin' , 'boston' , 'chicago' , 'boston' , 'honolulu' ]}
294
283
295
284
dic_three = {'city' : ['austin' , 'boston' , 'chicago' , 'honolulu' ],
296
- 'state' : ['US-TX' , 'US-MA' , 'US-IL' , 'US-HI' , ]}
285
+ 'state' : ['TX' , 'MA' , 'IL' , 'HI' ]}
297
286
298
287
assert len (normalized_entityset .entities ) == 3
299
- assert normalized_entityset .id == 'Sport'
288
+ assert normalized_entityset .id == 'Teams'
300
289
301
290
assert normalized_entityset .entities [0 ].df .equals (pd .DataFrame (dic_one ))
302
291
assert normalized_entityset .entities [1 ].df .equals (pd .DataFrame (
@@ -313,19 +302,11 @@ def test_make_entityset_custom_args():
313
302
assert normalized_entityset .entities [1 ].variable_types ['city' ] == Id
314
303
315
304
assert normalized_entityset .entities [2 ].variable_types ['city' ] == Index
316
- assert normalized_entityset .entities [2 ].variable_types ['state' ] == SubRegionCode
305
+ assert normalized_entityset .entities [2 ].variable_types ['state' ] == Text
317
306
318
307
319
- def test_auto_entityset_default_args ():
320
- dic = {'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
321
- 'Yellow' , 'Green' , 'Green' , 'Blue' ],
322
- 'jersey_num' : [1 , 2 , 3 , 1 , 2 , 1 , 5 , 8 , 2 , 2 ],
323
- 'player_name' : ['A' , 'B' , 'C' , 'D' , 'A' , 'E' , 'B' , 'A' , 'G' , 'H' ],
324
- 'city' : ['boston' , 'boston' , 'boston' , 'chicago' , 'chicago' ,
325
- 'honolulu' , 'honolulu' , 'boston' , 'boston' , 'austin' ],
326
- 'state' : ['US-MA' , 'US-MA' , 'US-MA' , 'US-IL' , 'US-IL' , 'US-HI' , 'US-HI' , 'US-MA' , 'US-MA' , 'US-TX' ]}
327
- df = pd .DataFrame (dic )
328
- normalized_entityset = autonormalize .auto_entityset (df )
308
+ def test_auto_entityset_default_args (teams_input ):
309
+ normalized_entityset = autonormalize .auto_entityset (teams_input .get_df ())
329
310
330
311
dic_one = {'jersey_num_team' : [0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 ],
331
312
'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
@@ -337,7 +318,7 @@ def test_auto_entityset_default_args():
337
318
'city' : ['austin' , 'boston' , 'chicago' , 'boston' , 'honolulu' ]}
338
319
339
320
dic_three = {'city' : ['austin' , 'boston' , 'chicago' , 'honolulu' ],
340
- 'state' : ['US-TX' , 'US-MA' , 'US-IL' , 'US-HI' , ]}
321
+ 'state' : ['TX' , 'MA' , 'IL' , 'HI' ]}
341
322
342
323
assert len (normalized_entityset .entities ) == 3
343
324
@@ -359,20 +340,12 @@ def test_auto_entityset_default_args():
359
340
assert normalized_entityset .entities [2 ].variable_types ['state' ] == Categorical
360
341
361
342
362
- def test_auto_entityset_custom_args ():
363
- dic = {'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
364
- 'Yellow' , 'Green' , 'Green' , 'Blue' ],
365
- 'jersey_num' : [1 , 2 , 3 , 1 , 2 , 1 , 5 , 8 , 2 , 2 ],
366
- 'player_name' : ['A' , 'B' , 'C' , 'D' , 'A' , 'E' , 'B' , 'A' , 'G' , 'H' ],
367
- 'city' : ['boston' , 'boston' , 'boston' , 'chicago' , 'chicago' ,
368
- 'honolulu' , 'honolulu' , 'boston' , 'boston' , 'austin' ],
369
- 'state' : ['US-MA' , 'US-MA' , 'US-MA' , 'US-IL' , 'US-IL' , 'US-HI' , 'US-HI' , 'US-MA' , 'US-MA' , 'US-TX' ]}
370
- df = pd .DataFrame (dic )
371
- normalized_entityset = autonormalize .auto_entityset (df = df ,
372
- name = 'Sport' ,
373
- variable_types = {'state' : SubRegionCode })
343
+ def test_auto_entityset_custom_args (teams_input ):
344
+ normalized_entityset = autonormalize .auto_entityset (df = teams_input .get_df (),
345
+ name = 'Teams' ,
346
+ variable_types = {'state' : Text })
374
347
375
- dic_one = {'team_jersey_num' : [0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 ],
348
+ dic_one = {'jersey_num_team' : [0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 ],
376
349
'team' : ['Red' , 'Red' , 'Red' , 'Orange' , 'Orange' , 'Yellow' ,
377
350
'Yellow' , 'Green' , 'Green' , 'Blue' ],
378
351
'jersey_num' : [1 , 2 , 3 , 1 , 2 , 1 , 5 , 8 , 2 , 2 ],
@@ -382,10 +355,10 @@ def test_auto_entityset_custom_args():
382
355
'city' : ['austin' , 'boston' , 'chicago' , 'boston' , 'honolulu' ]}
383
356
384
357
dic_three = {'city' : ['austin' , 'boston' , 'chicago' , 'honolulu' ],
385
- 'state' : ['US-TX' , 'US-MA' , 'US-IL' , 'US-HI' , ]}
358
+ 'state' : ['TX' , 'MA' , 'IL' , 'HI' ]}
386
359
387
360
assert len (normalized_entityset .entities ) == 3
388
- assert normalized_entityset .id == 'Sport'
361
+ assert normalized_entityset .id == 'Teams'
389
362
390
363
assert normalized_entityset .entities [0 ].df .equals (pd .DataFrame (dic_one ))
391
364
assert normalized_entityset .entities [1 ].df .equals (pd .DataFrame (
@@ -402,4 +375,4 @@ def test_auto_entityset_custom_args():
402
375
assert normalized_entityset .entities [1 ].variable_types ['city' ] == Id
403
376
404
377
assert normalized_entityset .entities [2 ].variable_types ['city' ] == Index
405
- assert normalized_entityset .entities [2 ].variable_types ['state' ] == SubRegionCode
378
+ assert normalized_entityset .entities [2 ].variable_types ['state' ] == Text
0 commit comments