From 561b33f3747ee4a829af56b87d939c25371a1969 Mon Sep 17 00:00:00 2001
From: Jaskirat Grover <jaskirat23@hotmail.com>
Date: Mon, 30 Sep 2019 12:32:54 +1000
Subject: [PATCH 1/6] Adding variable types to autonormalize

---
 autonormalize/autonormalize.py        | 27 ++++++++++++++++++++-------
 autonormalize/tests/test_normalize.py | 19 ++++++++++++++++++-
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/autonormalize/autonormalize.py b/autonormalize/autonormalize.py
index 945f959..8ee290c 100644
--- a/autonormalize/autonormalize.py
+++ b/autonormalize/autonormalize.py
@@ -70,7 +70,7 @@ def normalize_dataframe(df, dependencies):
     return depdf.return_dfs()
 
 
-def make_entityset(df, dependencies, name=None, time_index=None):
+def make_entityset(df, dependencies, name=None, time_index=None, variable_types=None):
     """
     Creates a normalized EntitySet from df based on the dependencies given.
     Keys for the newly created DataFrames can only be columns that are strings,
@@ -82,6 +82,10 @@ def make_entityset(df, dependencies, name=None, time_index=None):
         df (pd.DataFrame) : dataframe to normalize and make entity set from
         dependencies (Dependenies) : the dependencies discovered in df
         name (str, optional) : the name of created EntitySet
+        time_index (str, optional) : name of time column in the dataframe
+        variable_types (dict[str -> Variable], optional):
+            Keys are variable ids and values are variable types. Used to
+            initialize an entity's store.
 
     Returns:
         entityset (ft.EntitySet) : created entity set
@@ -97,10 +101,14 @@ def make_entityset(df, dependencies, name=None, time_index=None):
 
     while stack != []:
         current = stack.pop()
+        if variable_types is not None:
+            entity_variable_types = {col: variable_types[col] for col in current.df.columns if col in variable_types}
+        else:
+            entity_variable_types = None
         if time_index in current.df.columns:
-            entities[current.index[0]] = (current.df, current.index[0], time_index)
+            entities[current.index[0]] = (current.df, current.index[0], time_index, entity_variable_types)
         else:
-            entities[current.index[0]] = (current.df, current.index[0])
+            entities[current.index[0]] = (current.df, current.index[0], None, entity_variable_types)
         for child in current.children:
             # add to stack
             # add relationship
@@ -110,7 +118,7 @@ def make_entityset(df, dependencies, name=None, time_index=None):
     return ft.EntitySet(name, entities, relationships)
 
 
-def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
+def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None, variable_types=None):
     """
     Creates a normalized entityset from a dataframe.
 
@@ -126,13 +134,17 @@ def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
 
         name (str, optional) : the name of created EntitySet
 
-        time_index (str, optional) : name of time column in the dataframe.
+        time_index (str, optional) : name of time column in the dataframe
+
+        variable_types (dict[str -> Variable], optional):
+            Keys are variable ids and values are variable types. Used to
+            initialize an entity's store
 
     Returns:
 
         entityset (ft.EntitySet) : created entity set
     """
-    return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)
+    return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index, variable_types)
 
 
 def auto_normalize(df):
@@ -169,5 +181,6 @@ def normalize_entity(es, accuracy=0.98):
     if len(es.entities) == 0:
         raise ValueError('This EntitySet is empty')
     entity = es.entities[0]
-    new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index)
+    new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index,
+                            variable_types=entity.variable_types)
     return new_es
diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py
index 4956d82..e104fe1 100644
--- a/autonormalize/tests/test_normalize.py
+++ b/autonormalize/tests/test_normalize.py
@@ -1,7 +1,11 @@
 import pandas as pd
+import featuretools as ft
+
+from featuretools.variable_types import ZIPCode
 from pandas.util.testing import assert_frame_equal
 
-from autonormalize import classes, normalize
+from autonormalize import classes, normalize, autonormalize
+
 
 # from classes import Dependencies
 
@@ -178,3 +182,16 @@ def test_make_indexes():
     assert new_dfs[0][new_dfs[1].columns[0]][5] == val
     assert new_dfs[0][new_dfs[1].columns[0]][6] == val
     assert new_dfs[0][new_dfs[1].columns[0]][7] == val
+
+
+def test_variable_types():
+    df = ft.demo.load_mock_customer(n_customers=20, n_products=12, n_sessions=50,
+                                    n_transactions=100, return_single_table=True)
+    entityset = ft.EntitySet()
+    entityset.entity_from_dataframe(entity_id='Customer Transactions',
+                                    dataframe=df,
+                                    time_index='transaction_time',
+                                    variable_types={"zip_code": ZIPCode})
+
+    normalized_entityset = autonormalize.normalize_entity(entityset)
+    assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode

From 05ab955b0686513a482961660dd72eb6c4e7a91c Mon Sep 17 00:00:00 2001
From: Jaskirat Grover <jaskirat23@hotmail.com>
Date: Thu, 10 Oct 2019 15:41:18 +1100
Subject: [PATCH 2/6] Update README.md

---
 README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 22f669a..d54d5a6 100755
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ pip uninstall autonormalize
 
 ### `auto_entityset`
 ```shell
-auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None)
+auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None, variable_types=None)
 ```
 Creates a normalized entityset from a dataframe.
 
@@ -49,8 +49,10 @@ Creates a normalized entityset from a dataframe.
 
 * `name` (str, optional) : the name of created EntitySet
 
-* `time_index` (str, optional) : name of time column in the dataframe.
+* `time_index` (str, optional) : name of time column in the dataframe
 
+* `variable_types` (dict[str -> Variable], optional) : Keys are variable ids and values are variable types. Used to initialize an entity's store.
+
 **Returns:**
 
 * `entityset` (ft.EntitySet) : created entity set
@@ -85,7 +87,7 @@ Normalizes dataframe based on the dependencies given. Keys for the newly created
 ### `make_entityset`
 
 ```shell
-make_entityset(df, dependencies, name=None, time_index=None)
+make_entityset(df, dependencies, name=None, time_index=None, variable_types=None)
 ```
 Creates a normalized EntitySet from dataframe based on the dependencies given. Keys are chosen in the same fashion as for `normalize_dataframe`and a new index will be created if any key has more than a single attribute.
 

From db8257a42c809d119af2ca27aae1e7cb4ebc1180 Mon Sep 17 00:00:00 2001
From: Jaskirat Grover <jaskirat23@hotmail.com>
Date: Sun, 15 Mar 2020 16:37:55 +1100
Subject: [PATCH 3/6] Testing variable types of all columns in normalized
 entityset

---
 autonormalize/tests/test_normalize.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py
index e104fe1..db42947 100644
--- a/autonormalize/tests/test_normalize.py
+++ b/autonormalize/tests/test_normalize.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import featuretools as ft
 
-from featuretools.variable_types import ZIPCode
+from featuretools.variable_types import ZIPCode, Index, Datetime, Numeric, DatetimeTimeIndex, Categorical, Id
 from pandas.util.testing import assert_frame_equal
 
 from autonormalize import classes, normalize, autonormalize
@@ -194,4 +194,22 @@ def test_variable_types():
                                     variable_types={"zip_code": ZIPCode})
 
     normalized_entityset = autonormalize.normalize_entity(entityset)
+
+    assert normalized_entityset['transaction_id'].variable_types['transaction_id'] == Index
+    assert normalized_entityset['transaction_id'].variable_types['session_id'] == Id
+    assert normalized_entityset['transaction_id'].variable_types['transaction_time'] == DatetimeTimeIndex
+    assert normalized_entityset['transaction_id'].variable_types['product_id'] == Id
+    assert normalized_entityset['transaction_id'].variable_types['amount'] == Numeric
+
+    assert normalized_entityset['product_id'].variable_types['product_id'] == Index
+    assert normalized_entityset['product_id'].variable_types['brand'] == Categorical
+
+    assert normalized_entityset['session_id'].variable_types['session_id'] == Index
+    assert normalized_entityset['session_id'].variable_types['customer_id'] == Id
+    assert normalized_entityset['session_id'].variable_types['device'] == Categorical
+    assert normalized_entityset['session_id'].variable_types['session_start'] == Datetime
+
+    assert normalized_entityset['customer_id'].variable_types['customer_id'] == Index
+    assert normalized_entityset['customer_id'].variable_types['join_date'] == Datetime
+    assert normalized_entityset['customer_id'].variable_types['date_of_birth'] == Datetime
     assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode

From 46c2a192e444d220afeaa195a540f1c41a626d9a Mon Sep 17 00:00:00 2001
From: Jaskirat Grover <jaskirat23@hotmail.com>
Date: Mon, 13 Apr 2020 14:48:46 +1000
Subject: [PATCH 4/6] Initial tests for make_entityset and auto_entityset

---
 autonormalize/tests/test_normalize.py | 194 +++++++++++++++++++++++++-
 1 file changed, 192 insertions(+), 2 deletions(-)

diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py
index db42947..6007035 100644
--- a/autonormalize/tests/test_normalize.py
+++ b/autonormalize/tests/test_normalize.py
@@ -1,7 +1,8 @@
 import pandas as pd
 import featuretools as ft
 
-from featuretools.variable_types import ZIPCode, Index, Datetime, Numeric, DatetimeTimeIndex, Categorical, Id
+from featuretools.variable_types import ZIPCode, Index, Datetime, Numeric, DatetimeTimeIndex, Categorical, Id, \
+    SubRegionCode
 from pandas.util.testing import assert_frame_equal
 
 from autonormalize import classes, normalize, autonormalize
@@ -191,7 +192,7 @@ def test_variable_types():
     entityset.entity_from_dataframe(entity_id='Customer Transactions',
                                     dataframe=df,
                                     time_index='transaction_time',
-                                    variable_types={"zip_code": ZIPCode})
+                                    variable_types={'zip_code': ZIPCode})
 
     normalized_entityset = autonormalize.normalize_entity(entityset)
 
@@ -213,3 +214,192 @@ def test_variable_types():
     assert normalized_entityset['customer_id'].variable_types['join_date'] == Datetime
     assert normalized_entityset['customer_id'].variable_types['date_of_birth'] == Datetime
     assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode
+
+
+def test_make_entityset_default_args():
+    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                    'Yellow', 'Green', 'Green', 'Blue'],
+           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
+           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
+                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
+           'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']}
+    df = pd.DataFrame(dic)
+    deps = classes.Dependencies({'team': [['player_name', 'jersey_num']],
+                                 'jersey_num': [['player_name', 'team']],
+                                 'player_name': [['team', 'jersey_num']],
+                                 'city': [['team'], ['state'], ['player_name', 'jersey_num']],
+                                 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num'])
+    normalized_entityset = autonormalize.make_entityset(df, deps)
+
+    dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                        'Yellow', 'Green', 'Green', 'Blue'],
+               'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+               'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}
+
+    dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
+               'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
+
+    dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
+                 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]}
+
+    assert len(normalized_entityset.entities) == 3
+
+    assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
+    assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
+        dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']))
+    assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
+        dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
+
+    assert normalized_entityset.entities[0].variable_types['team_jersey_num'] == Index
+    assert normalized_entityset.entities[0].variable_types['team'] == Id
+    assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
+    assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical
+
+    assert normalized_entityset.entities[1].variable_types['team'] == Index
+    assert normalized_entityset.entities[1].variable_types['city'] == Id
+
+    assert normalized_entityset.entities[2].variable_types['city'] == Index
+    assert normalized_entityset.entities[2].variable_types['state'] == Categorical
+
+
+def test_make_entityset_custom_args():
+    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                    'Yellow', 'Green', 'Green', 'Blue'],
+           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
+           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
+                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
+           'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']}
+    df = pd.DataFrame(dic)
+    deps = classes.Dependencies({'team': [['player_name', 'jersey_num']],
+                                 'jersey_num': [['player_name', 'team']],
+                                 'player_name': [['team', 'jersey_num']],
+                                 'city': [['team'], ['state'], ['player_name', 'jersey_num']],
+                                 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num'])
+    normalized_entityset = autonormalize.make_entityset(df=df,
+                                                        dependencies=deps,
+                                                        name='Sport',
+                                                        variable_types={'state': SubRegionCode})
+
+    dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                        'Yellow', 'Green', 'Green', 'Blue'],
+               'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+               'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}
+
+    dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
+               'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
+
+    dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
+                 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]}
+
+    assert len(normalized_entityset.entities) == 3
+    assert normalized_entityset.id == 'Sport'
+
+    assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
+    assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
+        dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']))
+    assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
+        dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
+
+    assert normalized_entityset.entities[0].variable_types['team_jersey_num'] == Index
+    assert normalized_entityset.entities[0].variable_types['team'] == Id
+    assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
+    assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical
+
+    assert normalized_entityset.entities[1].variable_types['team'] == Index
+    assert normalized_entityset.entities[1].variable_types['city'] == Id
+
+    assert normalized_entityset.entities[2].variable_types['city'] == Index
+    assert normalized_entityset.entities[2].variable_types['state'] == SubRegionCode
+
+
+def test_auto_entityset_default_args():
+    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                    'Yellow', 'Green', 'Green', 'Blue'],
+           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
+           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
+                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
+           'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']}
+    df = pd.DataFrame(dic)
+    normalized_entityset = autonormalize.auto_entityset(df)
+
+    dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                        'Yellow', 'Green', 'Green', 'Blue'],
+               'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+               'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}
+
+    dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
+               'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
+
+    dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
+                 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]}
+
+    assert len(normalized_entityset.entities) == 3
+
+    assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
+    assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
+        dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']))
+    assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
+        dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
+
+    assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index
+    assert normalized_entityset.entities[0].variable_types['team'] == Id
+    assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
+    assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical
+
+    assert normalized_entityset.entities[1].variable_types['team'] == Index
+    assert normalized_entityset.entities[1].variable_types['city'] == Id
+
+    assert normalized_entityset.entities[2].variable_types['city'] == Index
+    assert normalized_entityset.entities[2].variable_types['state'] == Categorical
+
+
+def test_auto_entityset_custom_args():
+    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                    'Yellow', 'Green', 'Green', 'Blue'],
+           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
+           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
+                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
+           'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']}
+    df = pd.DataFrame(dic)
+    normalized_entityset = autonormalize.auto_entityset(df=df,
+                                                        name='Sport',
+                                                        variable_types={'state': SubRegionCode})
+
+    dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+               'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                        'Yellow', 'Green', 'Green', 'Blue'],
+               'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+               'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']}
+
+    dic_two = {'team': ['Blue', 'Green', 'Orange', 'Red', 'Yellow'],
+               'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
+
+    dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
+                 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]}
+
+    assert len(normalized_entityset.entities) == 3
+    assert normalized_entityset.id == 'Sport'
+
+    assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
+    assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
+        dic_two, index=['Blue', 'Green', 'Orange', 'Red', 'Yellow']))
+    assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
+        dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
+
+    assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index
+    assert normalized_entityset.entities[0].variable_types['team'] == Id
+    assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
+    assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical
+
+    assert normalized_entityset.entities[1].variable_types['team'] == Index
+    assert normalized_entityset.entities[1].variable_types['city'] == Id
+
+    assert normalized_entityset.entities[2].variable_types['city'] == Index
+    assert normalized_entityset.entities[2].variable_types['state'] == SubRegionCode

From 5a66f942457a7c31d78a80c8f57a92a32e6aa1f6 Mon Sep 17 00:00:00 2001
From: Jaskirat Grover <jaskirat23@hotmail.com>
Date: Sat, 18 Apr 2020 14:59:06 +1000
Subject: [PATCH 5/6] Adding pytest fixture for teams example

---
 autonormalize/tests/test_normalize.py | 147 +++++++++++---------------
 1 file changed, 60 insertions(+), 87 deletions(-)

diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py
index 6007035..a323ecf 100644
--- a/autonormalize/tests/test_normalize.py
+++ b/autonormalize/tests/test_normalize.py
@@ -1,17 +1,47 @@
 import pandas as pd
-import featuretools as ft
-
-from featuretools.variable_types import ZIPCode, Index, Datetime, Numeric, DatetimeTimeIndex, Categorical, Id, \
-    SubRegionCode
+import pytest
 from pandas.util.testing import assert_frame_equal
 
-from autonormalize import classes, normalize, autonormalize
-
+import featuretools as ft
+from featuretools.variable_types import (
+    Categorical,
+    Datetime,
+    DatetimeTimeIndex,
+    Id,
+    Index,
+    Numeric,
+    Text,
+    ZIPCode
+)
+
+from autonormalize import autonormalize, classes, normalize
 
 # from classes import Dependencies
 
 # from normalize import normalize, find_most_comm, split_on_dep
 
+@pytest.fixture
+def teams_input():
+    class Teams:
+        def get_df(self):
+            dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
+                            'Yellow', 'Green', 'Green', 'Blue'],
+                   'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
+                   'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
+                   'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
+                            'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
+                   'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']}
+            return pd.DataFrame(dic)
+
+        def get_deps(self):
+            return classes.Dependencies({'team': [['player_name', 'jersey_num']],
+                                         'jersey_num': [['player_name', 'team']],
+                                         'player_name': [['team', 'jersey_num']],
+                                         'city': [['team'], ['state'], ['player_name', 'jersey_num']],
+                                         'state': [['team'], ['player_name', 'jersey_num'],
+                                                   ['city']]}, ['team', 'jersey_num'])
+    return Teams()
+
 
 def test_normalize():
     # how to test that relations remain the same???
@@ -105,23 +135,8 @@ def test_choose_index():
     assert normalize.choose_index(keys, df) == ['A', 'B']
 
 
-def test_normalize_dataframe():
-
-    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
-                    'Yellow', 'Green', 'Green', 'Blue'],
-           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
-           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
-           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
-                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
-           'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']}
-    df = pd.DataFrame(dic)
-    deps = classes.Dependencies({'team': [['player_name', 'jersey_num']],
-                                 'jersey_num': [['player_name', 'team']],
-                                 'player_name': [['team', 'jersey_num']],
-                                 'city': [['team'], ['state'], ['player_name', 'jersey_num']],
-                                 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num'])
-
-    depdf = normalize.DepDF(deps, df, deps.get_prim_key())
+def test_normalize_dataframe(teams_input):
+    depdf = normalize.DepDF(teams_input.get_deps(), teams_input.get_df(), teams_input.get_deps().get_prim_key())
     normalize.normalize_dataframe(depdf)
     new_dfs = depdf.return_dfs()
 
@@ -216,21 +231,8 @@ def test_variable_types():
     assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode
 
 
-def test_make_entityset_default_args():
-    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
-                    'Yellow', 'Green', 'Green', 'Blue'],
-           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
-           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
-           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
-                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
-           'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']}
-    df = pd.DataFrame(dic)
-    deps = classes.Dependencies({'team': [['player_name', 'jersey_num']],
-                                 'jersey_num': [['player_name', 'team']],
-                                 'player_name': [['team', 'jersey_num']],
-                                 'city': [['team'], ['state'], ['player_name', 'jersey_num']],
-                                 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num'])
-    normalized_entityset = autonormalize.make_entityset(df, deps)
+def test_make_entityset_default_args(teams_input):
+    normalized_entityset = autonormalize.make_entityset(teams_input.get_df(), teams_input.get_deps())
 
     dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
@@ -242,7 +244,7 @@ def test_make_entityset_default_args():
                'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
 
     dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
-                 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]}
+                 'state': ['TX', 'MA', 'IL', 'HI']}
 
     assert len(normalized_entityset.entities) == 3
 
@@ -264,24 +266,11 @@ def test_make_entityset_default_args():
     assert normalized_entityset.entities[2].variable_types['state'] == Categorical
 
 
-def test_make_entityset_custom_args():
-    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
-                    'Yellow', 'Green', 'Green', 'Blue'],
-           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
-           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
-           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
-                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
-           'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']}
-    df = pd.DataFrame(dic)
-    deps = classes.Dependencies({'team': [['player_name', 'jersey_num']],
-                                 'jersey_num': [['player_name', 'team']],
-                                 'player_name': [['team', 'jersey_num']],
-                                 'city': [['team'], ['state'], ['player_name', 'jersey_num']],
-                                 'state': [['team'], ['player_name', 'jersey_num'], ['city']]}, ['team', 'jersey_num'])
-    normalized_entityset = autonormalize.make_entityset(df=df,
-                                                        dependencies=deps,
-                                                        name='Sport',
-                                                        variable_types={'state': SubRegionCode})
+def test_make_entityset_custom_args(teams_input):
+    normalized_entityset = autonormalize.make_entityset(df=teams_input.get_df(),
+                                                        dependencies=teams_input.get_deps(),
+                                                        name='Teams',
+                                                        variable_types={'state': Text})
 
     dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
@@ -293,10 +282,10 @@ def test_make_entityset_custom_args():
                'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
 
     dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
-                 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]}
+                 'state': ['TX', 'MA', 'IL', 'HI']}
 
     assert len(normalized_entityset.entities) == 3
-    assert normalized_entityset.id == 'Sport'
+    assert normalized_entityset.id == 'Teams'
 
     assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
     assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
@@ -313,19 +302,11 @@ def test_make_entityset_custom_args():
     assert normalized_entityset.entities[1].variable_types['city'] == Id
 
     assert normalized_entityset.entities[2].variable_types['city'] == Index
-    assert normalized_entityset.entities[2].variable_types['state'] == SubRegionCode
+    assert normalized_entityset.entities[2].variable_types['state'] == Text
 
 
-def test_auto_entityset_default_args():
-    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
-                    'Yellow', 'Green', 'Green', 'Blue'],
-           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
-           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
-           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
-                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
-           'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']}
-    df = pd.DataFrame(dic)
-    normalized_entityset = autonormalize.auto_entityset(df)
+def test_auto_entityset_default_args(teams_input):
+    normalized_entityset = autonormalize.auto_entityset(teams_input.get_df())
 
     dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
@@ -337,7 +318,7 @@ def test_auto_entityset_default_args():
                'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
 
     dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
-                 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]}
+                 'state': ['TX', 'MA', 'IL', 'HI']}
 
     assert len(normalized_entityset.entities) == 3
 
@@ -359,20 +340,12 @@ def test_auto_entityset_default_args():
     assert normalized_entityset.entities[2].variable_types['state'] == Categorical
 
 
-def test_auto_entityset_custom_args():
-    dic = {'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
-                    'Yellow', 'Green', 'Green', 'Blue'],
-           'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
-           'player_name': ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H'],
-           'city': ['boston', 'boston', 'boston', 'chicago', 'chicago',
-                    'honolulu', 'honolulu', 'boston', 'boston', 'austin'],
-           'state': ['US-MA', 'US-MA', 'US-MA', 'US-IL', 'US-IL', 'US-HI', 'US-HI', 'US-MA', 'US-MA', 'US-TX']}
-    df = pd.DataFrame(dic)
-    normalized_entityset = autonormalize.auto_entityset(df=df,
-                                                        name='Sport',
-                                                        variable_types={'state': SubRegionCode})
+def test_auto_entityset_custom_args(teams_input):
+    normalized_entityset = autonormalize.auto_entityset(df=teams_input.get_df(),
+                                                        name='Teams',
+                                                        variable_types={'state': Text})
 
-    dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+    dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                         'Yellow', 'Green', 'Green', 'Blue'],
                'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
@@ -382,10 +355,10 @@ def test_auto_entityset_custom_args():
                'city': ['austin', 'boston', 'chicago', 'boston', 'honolulu']}
 
     dic_three = {'city': ['austin', 'boston', 'chicago', 'honolulu'],
-                 'state': ['US-TX', 'US-MA', 'US-IL', 'US-HI', ]}
+                 'state': ['TX', 'MA', 'IL', 'HI']}
 
     assert len(normalized_entityset.entities) == 3
-    assert normalized_entityset.id == 'Sport'
+    assert normalized_entityset.id == 'Teams'
 
     assert normalized_entityset.entities[0].df.equals(pd.DataFrame(dic_one))
     assert normalized_entityset.entities[1].df.equals(pd.DataFrame(
@@ -402,4 +375,4 @@ def test_auto_entityset_custom_args():
     assert normalized_entityset.entities[1].variable_types['city'] == Id
 
     assert normalized_entityset.entities[2].variable_types['city'] == Index
-    assert normalized_entityset.entities[2].variable_types['state'] == SubRegionCode
+    assert normalized_entityset.entities[2].variable_types['state'] == Text

From 0d96a1bf148d23e17631116a881659fe9f4b3953 Mon Sep 17 00:00:00 2001
From: Jaskirat Grover <jaskirat23@hotmail.com>
Date: Sat, 18 Apr 2020 15:01:10 +1000
Subject: [PATCH 6/6] Changing new index names to be alphabetical for teams
 example

---
 autonormalize/tests/test_normalize.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/autonormalize/tests/test_normalize.py b/autonormalize/tests/test_normalize.py
index a323ecf..d71133e 100644
--- a/autonormalize/tests/test_normalize.py
+++ b/autonormalize/tests/test_normalize.py
@@ -234,7 +234,7 @@ def test_variable_types():
 def test_make_entityset_default_args(teams_input):
     normalized_entityset = autonormalize.make_entityset(teams_input.get_df(), teams_input.get_deps())
 
-    dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+    dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                         'Yellow', 'Green', 'Green', 'Blue'],
                'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
@@ -254,7 +254,7 @@ def test_make_entityset_default_args(teams_input):
     assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
         dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
 
-    assert normalized_entityset.entities[0].variable_types['team_jersey_num'] == Index
+    assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index
     assert normalized_entityset.entities[0].variable_types['team'] == Id
     assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
     assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical
@@ -272,7 +272,7 @@ def test_make_entityset_custom_args(teams_input):
                                                         name='Teams',
                                                         variable_types={'state': Text})
 
-    dic_one = {'team_jersey_num': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+    dic_one = {'jersey_num_team': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                'team': ['Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow',
                         'Yellow', 'Green', 'Green', 'Blue'],
                'jersey_num': [1, 2, 3, 1, 2, 1, 5, 8, 2, 2],
@@ -293,7 +293,7 @@ def test_make_entityset_custom_args(teams_input):
     assert normalized_entityset.entities[2].df.equals(pd.DataFrame(
         dic_three, index=['austin', 'boston', 'chicago', 'honolulu']))
 
-    assert normalized_entityset.entities[0].variable_types['team_jersey_num'] == Index
+    assert normalized_entityset.entities[0].variable_types['jersey_num_team'] == Index
     assert normalized_entityset.entities[0].variable_types['team'] == Id
     assert normalized_entityset.entities[0].variable_types['jersey_num'] == Numeric
     assert normalized_entityset.entities[0].variable_types['player_name'] == Categorical