1
1
import pandas as pd
2
2
import featuretools as ft
3
3
4
- from featuretools .variable_types import ZIPCode , Index , Datetime , Numeric , DatetimeTimeIndex , Categorical , Id
4
+ from featuretools .variable_types import ZIPCode , Index , Datetime , Numeric , DatetimeTimeIndex , Categorical , Id , \
5
+ SubRegionCode
5
6
from pandas .util .testing import assert_frame_equal
6
7
7
8
from autonormalize import classes , normalize , autonormalize
@@ -191,7 +192,7 @@ def test_variable_types():
191
192
entityset .entity_from_dataframe (entity_id = 'Customer Transactions' ,
192
193
dataframe = df ,
193
194
time_index = 'transaction_time' ,
194
- variable_types = {" zip_code" : ZIPCode })
195
+ variable_types = {' zip_code' : ZIPCode })
195
196
196
197
normalized_entityset = autonormalize .normalize_entity (entityset )
197
198
@@ -213,3 +214,192 @@ def test_variable_types():
213
214
assert normalized_entityset ['customer_id' ].variable_types ['join_date' ] == Datetime
214
215
assert normalized_entityset ['customer_id' ].variable_types ['date_of_birth' ] == Datetime
215
216
assert normalized_entityset ['customer_id' ].variable_types ['zip_code' ] == ZIPCode
217
+
218
+
219
def test_make_entityset_default_args():
    """Check make_entityset when called with only a dataframe and dependencies.

    The resulting entity set must contain three entities whose dataframes
    and variable types match the expected values spelled out below.
    """
    source = pd.DataFrame({
        'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                 'Yellow', 'Green', 'Green', 'Blue'],
        'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
        'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
        'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
                 'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
        'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL',
                  'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX'],
    })
    dependencies = classes.Dependencies(
        {
            'team': [['player_name', 'jersey_num']],
            'jersey_num': [['player_name', 'team']],
            'player_name': [['team', 'jersey_num']],
            'city': [['team'], ['state'], ['player_name', 'jersey_num']],
            'state': [['team'], ['player_name', 'jersey_num'], ['city']],
        },
        ['team', 'jersey_num'],
    )
    result = autonormalize.make_entityset(source, dependencies)

    # Expected dataframes, in the order the entities are asserted below.
    expected_frames = [
        pd.DataFrame({
            'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                     'Yellow', 'Green', 'Green', 'Blue'],
            'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
            'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
        }),
        pd.DataFrame(
            {
                'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
                'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu'],
            },
            index=['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
        ),
        pd.DataFrame(
            {
                'city': ['austin', 'boston', 'chicago', 'honolulu'],
                'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI'],
            },
            index=['austin', 'boston', 'chicago', 'honolulu'],
        ),
    ]

    # Expected (column, variable type) pairs per entity, same order as above.
    expected_types = [
        [('team_jersey_num', Index), ('team', Id),
         ('jersey_num', Numeric), ('player_name', Categorical)],
        [('team', Index), ('city', Id)],
        [('city', Index), ('state', Categorical)],
    ]

    assert len(result.entities) == 3

    for position, frame in enumerate(expected_frames):
        assert result.entities[position].df.equals(frame)

    for position, pairs in enumerate(expected_types):
        for column, variable_type in pairs:
            assert result.entities[position].variable_types[column] == variable_type
265
+
266
+
267
def test_make_entityset_custom_args():
    """Check make_entityset when a name and variable_types are supplied.

    Beyond the three expected entities and their dataframes, the entity
    set id must be 'Sport' and the 'state' column must be typed as
    SubRegionCode.
    """
    source = pd.DataFrame({
        'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                 'Yellow', 'Green', 'Green', 'Blue'],
        'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
        'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
        'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
                 'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
        'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL',
                  'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX'],
    })
    dependencies = classes.Dependencies(
        {
            'team': [['player_name', 'jersey_num']],
            'jersey_num': [['player_name', 'team']],
            'player_name': [['team', 'jersey_num']],
            'city': [['team'], ['state'], ['player_name', 'jersey_num']],
            'state': [['team'], ['player_name', 'jersey_num'], ['city']],
        },
        ['team', 'jersey_num'],
    )
    result = autonormalize.make_entityset(
        df=source,
        dependencies=dependencies,
        name='Sport',
        variable_types={'state': SubRegionCode},
    )

    # Expected dataframes, in the order the entities are asserted below.
    expected_frames = [
        pd.DataFrame({
            'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                     'Yellow', 'Green', 'Green', 'Blue'],
            'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
            'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
        }),
        pd.DataFrame(
            {
                'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
                'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu'],
            },
            index=['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
        ),
        pd.DataFrame(
            {
                'city': ['austin', 'boston', 'chicago', 'honolulu'],
                'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI'],
            },
            index=['austin', 'boston', 'chicago', 'honolulu'],
        ),
    ]

    # Expected (column, variable type) pairs per entity, same order as above.
    expected_types = [
        [('team_jersey_num', Index), ('team', Id),
         ('jersey_num', Numeric), ('player_name', Categorical)],
        [('team', Index), ('city', Id)],
        [('city', Index), ('state', SubRegionCode)],
    ]

    assert len(result.entities) == 3
    assert result.id == 'Sport'

    for position, frame in enumerate(expected_frames):
        assert result.entities[position].df.equals(frame)

    for position, pairs in enumerate(expected_types):
        for column, variable_type in pairs:
            assert result.entities[position].variable_types[column] == variable_type
317
+
318
+
319
def test_auto_entityset_default_args():
    """Check auto_entityset when called with only a dataframe.

    The resulting entity set must contain three entities whose dataframes
    and variable types match the expected values spelled out below; the
    first entity's index column is expected to be named 'jersey_num_team'.
    """
    source = pd.DataFrame({
        'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                 'Yellow', 'Green', 'Green', 'Blue'],
        'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
        'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
        'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
                 'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
        'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL',
                  'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX'],
    })
    result = autonormalize.auto_entityset(source)

    # Expected dataframes, in the order the entities are asserted below.
    expected_frames = [
        pd.DataFrame({
            'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                     'Yellow', 'Green', 'Green', 'Blue'],
            'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
            'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
        }),
        pd.DataFrame(
            {
                'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
                'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu'],
            },
            index=['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
        ),
        pd.DataFrame(
            {
                'city': ['austin', 'boston', 'chicago', 'honolulu'],
                'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI'],
            },
            index=['austin', 'boston', 'chicago', 'honolulu'],
        ),
    ]

    # Expected (column, variable type) pairs per entity, same order as above.
    expected_types = [
        [('jersey_num_team', Index), ('team', Id),
         ('jersey_num', Numeric), ('player_name', Categorical)],
        [('team', Index), ('city', Id)],
        [('city', Index), ('state', Categorical)],
    ]

    assert len(result.entities) == 3

    for position, frame in enumerate(expected_frames):
        assert result.entities[position].df.equals(frame)

    for position, pairs in enumerate(expected_types):
        for column, variable_type in pairs:
            assert result.entities[position].variable_types[column] == variable_type
360
+
361
+
362
def test_auto_entityset_custom_args():
    """Check auto_entityset when a name and variable_types are supplied.

    Beyond the three expected entities and their dataframes, the entity
    set id must be 'Sport' and the 'state' column must be typed as
    SubRegionCode.
    """
    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                    'Yellow', 'Green', 'Green', 'Blue'],
           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
           'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']}
    df = pd.DataFrame(dic)
    normalized_entityset = autonormalize.auto_entityset(df=df,
                                                        name='Sport',
                                                        variable_types={'state': SubRegionCode})

    # The index column must be named 'jersey_num_team' here: the variable
    # type assertion below looks it up under that name, and DataFrame.equals
    # also compares column labels, so the expected frame has to agree.
    dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
               'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                        'Yellow', 'Green', 'Green', 'Blue'],
               'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
               'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}

    dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
               'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}

    dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
                 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]}

    assert len(normalized_entityset.entities) == 3
    assert normalized_entityset.id == 'Sport'

    assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
    assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
        dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']))
    assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
        dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))

    assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index
    assert normalized_entityset.entities[0].variable_types['team'] == Id
    assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
    assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical

    assert normalized_entityset.entities[1].variable_types['team'] == Index
    assert normalized_entityset.entities[1].variable_types['city'] == Id

    assert normalized_entityset.entities[2].variable_types['city'] == Index
    assert normalized_entityset.entities[2].variable_types['state'] == SubRegionCode
0 commit comments