From 561b33f3747ee4a829af56b87d939c25371a1969 Mon Sep 17 00:00:00 2001 From: Jaskirat Grover Date: Mon, 30 Sep 2019 12:32:54 +1000 Subject: [PATCH 1/6] Adding variable types to autonormalize --- autonormalize/autonormalize.py | 27 ++++++++++++++++++++------- autonormalize/tests/test_normalize.py | 19 ++++++++++++++++++- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/autonormalize/autonormalize.py b/autonormalize/autonormalize.py index 945f959..8ee290c 100644 --- a/autonormalize/autonormalize.py +++ b/autonormalize/autonormalize.py @@ -70,7 +70,7 @@ def normalize_dataframe(df, dependencies): return depdf.return_dfs() -def make_entityset(df, dependencies, name=None, time_index=None): +def make_entityset(df, dependencies, name=None, time_index=None, variable_types=None): """ Creates a normalized EntitySet from df based on the dependencies given. Keys for the newly created DataFrames can only be columns that are strings, @@ -82,6 +82,10 @@ def make_entityset(df, dependencies, name=None, time_index=None): df (pd.DataFrame) : dataframe to normalize and make entity set from dependencies (Dependenies) : the dependencies discovered in df name (str, optional) : the name of created EntitySet + time_index (str, optional) : name of time column in the dataframe + variable_types (dict[str -> Variable], optional): + Keys are of variable ids and values are variable types. Used to + initialize an entity's store. Returns: entityset (ft.EntitySet) : created entity set @@ -97,10 +101,14 @@ def make_entityset(df, dependencies, name=None, time_index=None): while stack != []: current = stack.pop() + if variable_types is not None: + entity_variable_types = {col: variable_types[col] for col in current.df.columns if col in variable_types} + else: + entity_variable_types = None if time_index in current.df.columns: - entities[current.index[0]] = (current.df, current.index[0], time_index) + entities[current.index[0]] = (current.df, current.index[0], time_index, entity_variable_types) else: - entities[current.index[0]] = (current.df, current.index[0]) + entities[current.index[0]] = (current.df, current.index[0], None, entity_variable_types) for child in current.children: # add to stack # add relationship @@ -110,7 +118,7 @@ def make_entityset(df, dependencies, name=None, time_index=None): return ft.EntitySet(name, entities, relationships) -def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None): +def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None, variable_types=None): """ Creates a normalized entityset from a dataframe. @@ -126,13 +134,17 @@ def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None): name (str, optional) : the name of created EntitySet - time_index (str, optional) : name of time column in the dataframe. + time_index (str, optional) : name of time column in the dataframe + + variable_types (dict[str -> Variable], optional): + Keys are of variable ids and values are variable types. Used to + initialize an entity's store Returns: entityset (ft.EntitySet) : created entity set """ - return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index) + return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index, variable_types) def auto_normalize(df): @@ -169,5 +181,6 @@ def normalize_entity(es, accuracy=0.98): if len(es.entities) == 0: raise ValueError('This EntitySet is empty') entity = es.entities[0] - new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index) + new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index, + variable_types=entity.variable_types) return new_es diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py index 4956d82..e104fe1 100644 --- a/autonormalize/tests/test_normalize.py +++ b/autonormalize/tests/test_normalize.py @@ -1,7 +1,11 @@ import pandas as pd +import featuretools as ft + +from featuretools.variable_types import ZIPCode from pandas.util.testing import assert_frame_equal -from autonormalize import classes, normalize +from autonormalize import classes, normalize, autonormalize + # from classes import Dependencies @@ -178,3 +182,16 @@ def test_make_indexes(): assert new_dfs[0][new_dfs[1].columns[0]][5] == val assert new_dfs[0][new_dfs[1].columns[0]][6] == val assert new_dfs[0][new_dfs[1].columns[0]][7] == val + + +def test_variable_types(): + df = ft.demo.load_mock_customer(n_customers=20, n_products=12, n_sessions=50, + n_transactions=100, return_single_table=True) + entityset = ft.EntitySet() + entityset.entity_from_dataframe(entity_id='Customer Transactions', + dataframe=df, + time_index='transaction_time', + variable_types={"zip_code": ZIPCode}) + + normalized_entityset = autonormalize.normalize_entity(entityset) + assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode From 05ab955b0686513a482961660dd72eb6c4e7a91c Mon Sep 17 00:00:00 2001 From: Jaskirat Grover Date: Thu, 10 Oct 2019 15:41:18 +1100 Subject: [PATCH 2/6] Update README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 22f669a..d54d5a6 100755 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ pip uninstall autonormalize ### `auto_entityset` ```shell -auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None) +auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None, variable_types=None) ``` Creates a normalized entityset from a dataframe. @@ -49,8 +49,10 @@ Creates a normalized entityset from a dataframe. * `name` (str, optional) : the name of created EntitySet -* `time_index` (str, optional) : name of time column in the dataframe. +* `time_index` (str, optional) : name of time column in the dataframe +* `variable_types` (dict[str -> Variable], optional) : Keys are of variable ids and values are variable types. Used to initialize an entity's store. + **Returns:** * `entityset` (ft.EntitySet) : created entity set @@ -85,7 +87,7 @@ Normalizes dataframe based on the dependencies given. Keys for the newly created ### `make_entityset` ```shell -make_entityset(df, dependencies, name=None, time_index=None) +make_entityset(df, dependencies, name=None, time_index=None, variable_types=None) ``` Creates a normalized EntitySet from dataframe based on the dependencies given. Keys are chosen in the same fashion as for `normalize_dataframe`and a new index will be created if any key has more than a single attribute. From db8257a42c809d119af2ca27aae1e7cb4ebc1180 Mon Sep 17 00:00:00 2001 From: Jaskirat Grover Date: Sun, 15 Mar 2020 16:37:55 +1100 Subject: [PATCH 3/6] Testing variable types of all columns in normalized entityset --- autonormalize/tests/test_normalize.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py index e104fe1..db42947 100644 --- a/autonormalize/tests/test_normalize.py +++ b/autonormalize/tests/test_normalize.py @@ -1,7 +1,7 @@ import pandas as pd import featuretools as ft -from featuretools.variable_types import ZIPCode +from featuretools.variable_types import ZIPCode, Index, Datetime, Numeric, DatetimeTimeIndex, Categorical, Id from pandas.util.testing import assert_frame_equal from autonormalize import classes, normalize, autonormalize @@ -194,4 +194,22 @@ def test_variable_types(): variable_types={"zip_code": ZIPCode}) normalized_entityset = autonormalize.normalize_entity(entityset) + + assert normalized_entityset['transaction_id'].variable_types['transaction_id'] == Index + assert normalized_entityset['transaction_id'].variable_types['session_id'] == Id + assert normalized_entityset['transaction_id'].variable_types['transaction_time'] == DatetimeTimeIndex + assert normalized_entityset['transaction_id'].variable_types['product_id'] == Id + assert normalized_entityset['transaction_id'].variable_types['amount'] == Numeric + + assert normalized_entityset['product_id'].variable_types['product_id'] == Index + assert normalized_entityset['product_id'].variable_types['brand'] == Categorical + + assert normalized_entityset['session_id'].variable_types['session_id'] == Index + assert normalized_entityset['session_id'].variable_types['customer_id'] == Id + assert normalized_entityset['session_id'].variable_types['device'] == Categorical + assert normalized_entityset['session_id'].variable_types['session_start'] == Datetime + + assert normalized_entityset['customer_id'].variable_types['customer_id'] == Index + assert normalized_entityset['customer_id'].variable_types['join_date'] == Datetime + assert normalized_entityset['customer_id'].variable_types['date_of_birth'] == Datetime assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode From 46c2a192e444d220afeaa195a540f1c41a626d9a Mon Sep 17 00:00:00 2001 From: Jaskirat Grover Date: Mon, 13 Apr 2020 14:48:46 +1000 Subject: [PATCH 4/6] Initial tests for make_entityset and auto_entityset --- autonormalize/tests/test_normalize.py | 194 +++++++++++++++++++++++++- 1 file changed, 192 insertions(+), 2 deletions(-) diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py index db42947..6007035 100644 --- a/autonormalize/tests/test_normalize.py +++ b/autonormalize/tests/test_normalize.py @@ -1,7 +1,8 @@ import pandas as pd import featuretools as ft -from featuretools.variable_types import ZIPCode, Index, Datetime, Numeric, DatetimeTimeIndex, Categorical, Id +from featuretools.variable_types import ZIPCode, Index, Datetime, Numeric, DatetimeTimeIndex, Categorical, Id, \ + SubRegionCode from pandas.util.testing import assert_frame_equal from autonormalize import classes, normalize, autonormalize @@ -191,7 +192,7 @@ def test_variable_types(): entityset.entity_from_dataframe(entity_id='Customer Transactions', dataframe=df, time_index='transaction_time', - variable_types={"zip_code": ZIPCode}) + variable_types={'zip_code': ZIPCode}) normalized_entityset = autonormalize.normalize_entity(entityset) @@ -213,3 +214,192 @@ def test_variable_types(): assert normalized_entityset['customer_id'].variable_types['join_date'] == Datetime assert normalized_entityset['customer_id'].variable_types['date_of_birth'] == Datetime assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode + + +def test_make_entityset_default_args(): + dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', + 'Yellow', 'Green', 'Green', 'Blue'], + 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], + 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', + 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], + 'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']} + df = pd.DataFrame(dic) + deps = classes.Dependencies({'team': [['player_name', 'jersey_num']], + 'jersey_num': [['player_name', 'team']], + 'player_name': [['team', 'jersey_num']], + 'city': [['team'], ['state'], ['player_name', 'jersey_num']], + 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num']) + normalized_entityset = autonormalize.make_entityset(df, deps) + + dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', + 'Yellow', 'Green', 'Green', 'Blue'], + 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']} + + dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'], + 'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']} + + dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'], + 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]} + + assert len(normalized_entityset.entities) == 3 + + assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one)) + assert normalized_entityset.entities[1].df.equals(pd.DataFrame( + dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow'])) + assert normalized_entityset.entities[2].df.equals(pd.DataFrame( + dic_three, index=['austin', 'boston', 'chicago', 'honolulu'])) + + assert normalized_entityset.entities[0].variable_types['team_jersey_num'] == Index + assert normalized_entityset.entities[0].variable_types['team'] == Id + assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric + assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical + + assert normalized_entityset.entities[1].variable_types['team'] == Index + assert normalized_entityset.entities[1].variable_types['city'] == Id + + assert normalized_entityset.entities[2].variable_types['city'] == Index + assert normalized_entityset.entities[2].variable_types['state'] == Categorical + + +def test_make_entityset_custom_args(): + dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', + 'Yellow', 'Green', 'Green', 'Blue'], + 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], + 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', + 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], + 'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']} + df = pd.DataFrame(dic) + deps = classes.Dependencies({'team': [['player_name', 'jersey_num']], + 'jersey_num': [['player_name', 'team']], + 'player_name': [['team', 'jersey_num']], + 'city': [['team'], ['state'], ['player_name', 'jersey_num']], + 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num']) + normalized_entityset = autonormalize.make_entityset(df=df, + dependencies=deps, + name='Sport', + variable_types={'state': SubRegionCode}) + + dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', + 'Yellow', 'Green', 'Green', 'Blue'], + 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']} + + dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'], + 'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']} + + dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'], + 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]} + + assert len(normalized_entityset.entities) == 3 + assert normalized_entityset.id == 'Sport' + + assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one)) + assert normalized_entityset.entities[1].df.equals(pd.DataFrame( + dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow'])) + assert normalized_entityset.entities[2].df.equals(pd.DataFrame( + dic_three, index=['austin', 'boston', 'chicago', 'honolulu'])) + + assert normalized_entityset.entities[0].variable_types['team_jersey_num'] == Index + assert normalized_entityset.entities[0].variable_types['team'] == Id + assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric + assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical + + assert normalized_entityset.entities[1].variable_types['team'] == Index + assert normalized_entityset.entities[1].variable_types['city'] == Id + + assert normalized_entityset.entities[2].variable_types['city'] == Index + assert normalized_entityset.entities[2].variable_types['state'] == SubRegionCode + + +def test_auto_entityset_default_args(): + dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', + 'Yellow', 'Green', 'Green', 'Blue'], + 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], + 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', + 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], + 'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']} + df = pd.DataFrame(dic) + normalized_entityset = autonormalize.auto_entityset(df) + + dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', + 'Yellow', 'Green', 'Green', 'Blue'], + 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']} + + dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'], + 'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']} + + dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'], + 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]} + + assert len(normalized_entityset.entities) == 3 + + assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one)) + assert normalized_entityset.entities[1].df.equals(pd.DataFrame( + dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow'])) + assert normalized_entityset.entities[2].df.equals(pd.DataFrame( + dic_three, index=['austin', 'boston', 'chicago', 'honolulu'])) + + assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index + assert normalized_entityset.entities[0].variable_types['team'] == Id + assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric + assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical + + assert normalized_entityset.entities[1].variable_types['team'] == Index + assert normalized_entityset.entities[1].variable_types['city'] == Id + + assert normalized_entityset.entities[2].variable_types['city'] == Index + assert normalized_entityset.entities[2].variable_types['state'] == Categorical + + +def test_auto_entityset_custom_args(): + dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', + 'Yellow', 'Green', 'Green', 'Blue'], + 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], + 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', + 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], + 'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']} + df = pd.DataFrame(dic) + normalized_entityset = autonormalize.auto_entityset(df=df, + name='Sport', + variable_types={'state': SubRegionCode}) + + dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', + 'Yellow', 'Green', 'Green', 'Blue'], + 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']} + + dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'], + 'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']} + + dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'], + 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]} + + assert len(normalized_entityset.entities) == 3 + assert normalized_entityset.id == 'Sport' + + assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one)) + assert normalized_entityset.entities[1].df.equals(pd.DataFrame( + dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow'])) + assert normalized_entityset.entities[2].df.equals(pd.DataFrame( + dic_three, index=['austin', 'boston', 'chicago', 'honolulu'])) + + assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index + assert normalized_entityset.entities[0].variable_types['team'] == Id + assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric + assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical + + assert normalized_entityset.entities[1].variable_types['team'] == Index + assert normalized_entityset.entities[1].variable_types['city'] == Id + + assert normalized_entityset.entities[2].variable_types['city'] == Index + assert normalized_entityset.entities[2].variable_types['state'] == SubRegionCode From 5a66f942457a7c31d78a80c8f57a92a32e6aa1f6 Mon Sep 17 00:00:00 2001 From: Jaskirat Grover Date: Sat, 18 Apr 2020 14:59:06 +1000 Subject: [PATCH 5/6] Adding pytest fixture for teams example --- autonormalize/tests/test_normalize.py | 147 +++++++++++--------------- 1 file changed, 60 insertions(+), 87 deletions(-) diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py index 6007035..a323ecf 100644 --- a/autonormalize/tests/test_normalize.py +++ b/autonormalize/tests/test_normalize.py @@ -1,17 +1,47 @@ import pandas as pd -import featuretools as ft - -from featuretools.variable_types import ZIPCode, Index, Datetime, Numeric, DatetimeTimeIndex, Categorical, Id, \ - SubRegionCode +import pytest from pandas.util.testing import assert_frame_equal -from autonormalize import classes, normalize, autonormalize - +import featuretools as ft +from featuretools.variable_types import ( + Categorical, + Datetime, + DatetimeTimeIndex, + Id, + Index, + Numeric, + Text, + ZIPCode +) + +from autonormalize import autonormalize, classes, normalize # from classes import Dependencies # from normalize import normalize, find_most_comm, split_on_dep +@pytest.fixture +def teams_input(): + class Teams: + def get_df(self): + dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', + 'Yellow', 'Green', 'Green', 'Blue'], + 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], + 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], + 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', + 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], + 'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']} + return pd.DataFrame(dic) + + def get_deps(self): + return classes.Dependencies({'team': [['player_name', 'jersey_num']], + 'jersey_num': [['player_name', 'team']], + 'player_name': [['team', 'jersey_num']], + 'city': [['team'], ['state'], ['player_name', 'jersey_num']], + 'state': [['team'], ['player_name', 'jersey_num'], + ['city']]}, ['team', 'jersey_num']) + return Teams() + def test_normalize(): # how to test that relations remain the same??? @@ -105,23 +135,8 @@ def test_choose_index(): assert normalize.choose_index(keys, df) == ['A', 'B'] -def test_normalize_dataframe(): - - dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', - 'Yellow', 'Green', 'Green', 'Blue'], - 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], - 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], - 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', - 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], - 'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']} - df = pd.DataFrame(dic) - deps = classes.Dependencies({'team': [['player_name', 'jersey_num']], - 'jersey_num': [['player_name', 'team']], - 'player_name': [['team', 'jersey_num']], - 'city': [['team'], ['state'], ['player_name', 'jersey_num']], - 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num']) - - depdf = normalize.DepDF(deps, df, deps.get_prim_key()) +def test_normalize_dataframe(teams_input): + depdf = normalize.DepDF(teams_input.get_deps(), teams_input.get_df(), teams_input.get_deps().get_prim_key()) normalize.normalize_dataframe(depdf) new_dfs = depdf.return_dfs() @@ -216,21 +231,8 @@ def test_variable_types(): assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode -def test_make_entityset_default_args(): - dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', - 'Yellow', 'Green', 'Green', 'Blue'], - 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], - 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], - 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', - 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], - 'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']} - df = pd.DataFrame(dic) - deps = classes.Dependencies({'team': [['player_name', 'jersey_num']], - 'jersey_num': [['player_name', 'team']], - 'player_name': [['team', 'jersey_num']], - 'city': [['team'], ['state'], ['player_name', 'jersey_num']], - 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num']) - normalized_entityset = autonormalize.make_entityset(df, deps) +def test_make_entityset_default_args(teams_input): + normalized_entityset = autonormalize.make_entityset(teams_input.get_df(), teams_input.get_deps()) dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', @@ -242,7 +244,7 @@ def test_make_entityset_default_args(): 'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']} dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'], - 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]} + 'state': ['TX', 'MA', 'IL', 'HI']} assert len(normalized_entityset.entities) == 3 @@ -264,24 +266,11 @@ def test_make_entityset_default_args(): assert normalized_entityset.entities[2].variable_types['state'] == Categorical -def test_make_entityset_custom_args(): - dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', - 'Yellow', 'Green', 'Green', 'Blue'], - 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], - 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], - 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', - 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], - 'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']} - df = pd.DataFrame(dic) - deps = classes.Dependencies({'team': [['player_name', 'jersey_num']], - 'jersey_num': [['player_name', 'team']], - 'player_name': [['team', 'jersey_num']], - 'city': [['team'], ['state'], ['player_name', 'jersey_num']], - 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num']) - normalized_entityset = autonormalize.make_entityset(df=df, - dependencies=deps, - name='Sport', - variable_types={'state': SubRegionCode}) +def test_make_entityset_custom_args(teams_input): + normalized_entityset = autonormalize.make_entityset(df=teams_input.get_df(), + dependencies=teams_input.get_deps(), + name='Teams', + variable_types={'state': Text}) dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', @@ -293,10 +282,10 @@ def test_make_entityset_custom_args(): 'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']} dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'], - 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]} + 'state': ['TX', 'MA', 'IL', 'HI']} assert len(normalized_entityset.entities) == 3 - assert normalized_entityset.id == 'Sport' + assert normalized_entityset.id == 'Teams' assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one)) assert normalized_entityset.entities[1].df.equals(pd.DataFrame( @@ -313,19 +302,11 @@ def test_make_entityset_custom_args(): assert normalized_entityset.entities[1].variable_types['city'] == Id assert normalized_entityset.entities[2].variable_types['city'] == Index - assert normalized_entityset.entities[2].variable_types['state'] == SubRegionCode + assert normalized_entityset.entities[2].variable_types['state'] == Text -def test_auto_entityset_default_args(): - dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', - 'Yellow', 'Green', 'Green', 'Blue'], - 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], - 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], - 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', - 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], - 'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']} - df = pd.DataFrame(dic) - normalized_entityset = autonormalize.auto_entityset(df) +def test_auto_entityset_default_args(teams_input): + normalized_entityset = autonormalize.auto_entityset(teams_input.get_df()) dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', @@ -337,7 +318,7 @@ def test_auto_entityset_default_args(): 'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']} dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'], - 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]} + 'state': ['TX', 'MA', 'IL', 'HI']} assert len(normalized_entityset.entities) == 3 @@ -359,20 +340,12 @@ def test_auto_entityset_default_args(): assert normalized_entityset.entities[2].variable_types['state'] == Categorical -def test_auto_entityset_custom_args(): - dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', - 'Yellow', 'Green', 'Green', 'Blue'], - 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], - 'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'], - 'city': ['boston', 'boston', 'boston', 'chicago', 'chicago', - 'honolulu', 'honolulu', 'boston', 'boston', 'austin'], - 'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']} - df = pd.DataFrame(dic) - normalized_entityset = autonormalize.auto_entityset(df=df, - name='Sport', - variable_types={'state': SubRegionCode}) +def test_auto_entityset_custom_args(teams_input): + normalized_entityset = autonormalize.auto_entityset(df=teams_input.get_df(), + name='Teams', + variable_types={'state': Text}) - dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', 'Yellow', 'Green', 'Green', 'Blue'], 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], @@ -382,10 +355,10 @@ def test_auto_entityset_custom_args(): 'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']} dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'], - 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]} + 'state': ['TX', 'MA', 'IL', 'HI']} assert len(normalized_entityset.entities) == 3 - assert normalized_entityset.id == 'Sport' + assert normalized_entityset.id == 'Teams' assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one)) assert normalized_entityset.entities[1].df.equals(pd.DataFrame( @@ -402,4 +375,4 @@ def test_auto_entityset_custom_args(): assert normalized_entityset.entities[1].variable_types['city'] == Id assert normalized_entityset.entities[2].variable_types['city'] == Index - assert normalized_entityset.entities[2].variable_types['state'] == SubRegionCode + assert normalized_entityset.entities[2].variable_types['state'] == Text From 0d96a1bf148d23e17631116a881659fe9f4b3953 Mon Sep 17 00:00:00 2001 From: Jaskirat Grover Date: Sat, 18 Apr 2020 15:01:10 +1000 Subject: [PATCH 6/6] Changing new index names to be alphabetical for teams example --- autonormalize/tests/test_normalize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py index a323ecf..d71133e 100644 --- a/autonormalize/tests/test_normalize.py +++ b/autonormalize/tests/test_normalize.py @@ -234,7 +234,7 @@ def test_variable_types(): def test_make_entityset_default_args(teams_input): normalized_entityset = autonormalize.make_entityset(teams_input.get_df(), teams_input.get_deps()) - dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', 'Yellow', 'Green', 'Green', 'Blue'], 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], @@ -254,7 +254,7 @@ def test_make_entityset_default_args(teams_input): assert normalized_entityset.entities[2].df.equals(pd.DataFrame( dic_three, index=['austin', 'boston', 'chicago', 'honolulu'])) - assert normalized_entityset.entities[0].variable_types['team_jersey_num'] == Index + assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index assert normalized_entityset.entities[0].variable_types['team'] == Id assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical @@ -272,7 +272,7 @@ def test_make_entityset_custom_args(teams_input): name='Teams', variable_types={'state': Text}) - dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', 'Yellow', 'Green', 'Green', 'Blue'], 'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2], @@ -293,7 +293,7 @@ def test_make_entityset_custom_args(teams_input): assert normalized_entityset.entities[2].df.equals(pd.DataFrame( dic_three, index=['austin', 'boston', 'chicago', 'honolulu'])) - assert normalized_entityset.entities[0].variable_types['team_jersey_num'] == Index + assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index assert normalized_entityset.entities[0].variable_types['team'] == Id assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical