Skip to content

Commit 86f5eff

Browse files
author
Jaskirat Grover
committed
Adding variable types to autonormalize
1 parent 18b87ca commit 86f5eff

File tree

2 files changed

+38
-8
lines changed

2 files changed

+38
-8
lines changed

autonormalize/autonormalize.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def normalize_dataframe(df, dependencies):
7070
return depdf.return_dfs()
7171

7272

73-
def make_entityset(df, dependencies, name=None, time_index=None):
73+
def make_entityset(df, dependencies, name=None, time_index=None, variable_types=None):
7474
"""
7575
Creates a normalized EntitySet from df based on the dependencies given.
7676
Keys for the newly created DataFrames can only be columns that are strings,
@@ -82,6 +82,10 @@ def make_entityset(df, dependencies, name=None, time_index=None):
8282
df (pd.DataFrame) : dataframe to normalize and make entity set from
8383
dependencies (Dependencies) : the dependencies discovered in df
8484
name (str, optional) : the name of created EntitySet
85+
time_index (str, optional) : name of time column in the dataframe
86+
variable_types (dict[str -> Variable], optional):
87+
Keys are variable ids and values are variable types. Used to
88+
initialize an entity's store.
8589
8690
Returns:
8791
entityset (ft.EntitySet) : created entity set
@@ -97,10 +101,14 @@ def make_entityset(df, dependencies, name=None, time_index=None):
97101

98102
while stack != []:
99103
current = stack.pop()
104+
if variable_types is not None:
105+
entity_variable_types = {col: variable_types[col] for col in current.df.columns if col in variable_types}
106+
else:
107+
entity_variable_types = None
100108
if time_index in current.df.columns:
101-
entities[current.index[0]] = (current.df, current.index[0], time_index)
109+
entities[current.index[0]] = (current.df, current.index[0], time_index, entity_variable_types)
102110
else:
103-
entities[current.index[0]] = (current.df, current.index[0])
111+
entities[current.index[0]] = (current.df, current.index[0], None, entity_variable_types)
104112
for child in current.children:
105113
# add to stack
106114
# add relationship
@@ -110,7 +118,7 @@ def make_entityset(df, dependencies, name=None, time_index=None):
110118
return ft.EntitySet(name, entities, relationships)
111119

112120

113-
def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
121+
def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None, variable_types=None):
114122
"""
115123
Creates a normalized entityset from a dataframe.
116124
@@ -126,13 +134,17 @@ def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
126134
127135
name (str, optional) : the name of created EntitySet
128136
129-
time_index (str, optional) : name of time column in the dataframe.
137+
time_index (str, optional) : name of time column in the dataframe
138+
139+
variable_types (dict[str -> Variable], optional):
140+
Keys are variable ids and values are variable types. Used to
141+
initialize an entity's store.
130142
131143
Returns:
132144
133145
entityset (ft.EntitySet) : created entity set
134146
"""
135-
return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)
147+
return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index, variable_types)
136148

137149

138150
def auto_normalize(df):
@@ -169,5 +181,6 @@ def normalize_entity(es, accuracy=0.98):
169181
if len(es.entities) == 0:
170182
raise ValueError('This EntitySet is empty')
171183
entity = es.entities[0]
172-
new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index)
184+
new_es = auto_entityset(entity.df, accuracy, index=entity.index, name=es.id, time_index=entity.time_index,
185+
variable_types=entity.variable_types)
173186
return new_es

autonormalize/tests/test_normalize.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import pandas as pd
2+
import featuretools as ft
3+
4+
from featuretools.variable_types import ZIPCode
25
from pandas.util.testing import assert_frame_equal
36

4-
from autonormalize import classes, normalize
7+
from autonormalize import classes, normalize, autonormalize
8+
59

610
# from classes import Dependencies
711

@@ -178,3 +182,16 @@ def test_make_indexes():
178182
assert new_dfs[0][new_dfs[1].columns[0]][5] == val
179183
assert new_dfs[0][new_dfs[1].columns[0]][6] == val
180184
assert new_dfs[0][new_dfs[1].columns[0]][7] == val
185+
186+
187+
def test_variable_types():
188+
df = ft.demo.load_mock_customer(n_customers=20, n_products=12, n_sessions=50,
189+
n_transactions=100, return_single_table=True)
190+
entityset = ft.EntitySet()
191+
entityset.entity_from_dataframe(entity_id='Customer Transactions',
192+
dataframe=df,
193+
time_index='transaction_time',
194+
variable_types={"zip_code": ZIPCode})
195+
196+
normalized_entityset = autonormalize.normalize_entity(entityset)
197+
assert normalized_entityset['customer_id'].variable_types['zip_code'] == ZIPCode

0 commit comments

Comments
 (0)