Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions autonormalize/dfd.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from functools import partial
from itertools import combinations

import numpy
import pandas as pd
from tqdm import tqdm

from .classes import DfdDependencies, LHSs, Masks, Node
Expand Down Expand Up @@ -359,7 +359,6 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
acc = 0

for index, row in indicator.iterrows():

mask = None
for attr in lhs_set:

Expand All @@ -368,14 +367,19 @@ def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
if df[attr].dtypes.name == 'datetime64[ns]':
m = df[attr] == row[attr]
else:
m = df[attr].values == row[attr]
if pd.isna(row[attr]):
m = df[attr].isnull()
else:
m = df[attr].values == row[attr]
masks.add_mask(attr, row[attr], m)
if mask is None:
mask = m
else:
mask = mask & m
options = df[mask]
_, unique_counts = numpy.unique(options[rhs].to_numpy(), return_counts=True)

# _, unique_counts = np.unique(options[rhs].to_numpy(), return_counts=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this line be removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I removed this line, along with another temporary test that I had added while trying to reproduce the problem reported in #19.

unique_counts = options[rhs].value_counts()
acc += unique_counts.sum() - unique_counts.max()
if acc > limit:
return False
Expand Down
6 changes: 4 additions & 2 deletions autonormalize/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def make_indexes(depdf):
Arguments:
depdf (DepDF) : depDF to make indexes for
"""

prim_key = depdf.deps.get_prim_key()

if len(prim_key) > 1:
Expand Down Expand Up @@ -103,8 +104,9 @@ def make_indexes(depdf):

for index in indices[name]:
add[index] = new_val

depdf.parent.df.drop(columns=prim_key, inplace=True)
# Don't drop a column if it is needed in another parent relationship
to_drop = [key for key in prim_key if key not in depdf.parent.deps.serialize().keys()]
depdf.parent.df.drop(columns=to_drop, inplace=True)
depdf.parent.df.insert(len(depdf.parent.df.columns), '_'.join(prim_key), add)

for child in depdf.children:
Expand Down
44 changes: 26 additions & 18 deletions autonormalize/tests/test_dfd.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os

import numpy as np
import pandas as pd

from autonormalize import dfd
Expand Down Expand Up @@ -73,21 +74,28 @@ def test_compute_partitions():
assert not dfd.compute_partitions(df, 'c', frozenset(['a', 'b']), {}, 0.96, mask)


# def test_approximate_dependencies():
# mask = dfd.Masks(['a', 'b', 'c'])
# a = [6, 2, 3, 7, 8, 1, 0, 2, 0, 3, 6, 0, 4, 6, 8, 7, 6, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, 6, 4, 6, 8]
# # b = [int(x%2 == 0) for x in a]
# b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
# # c = [(a[i] + b[i])<4 for i in range(40)]
# c = [False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
# df = pd.DataFrame({'a': a, 'b': b, 'c': c})
# assert dfd.approximate_dependencies([0, 1], 2, df, 1.00, mask, 0.90)
# assert dfd.approximate_dependencies(set([0, 1]), 2, df, .90, mask, 0.90)
# c[0] = True
# df = pd.DataFrame({'a': a, 'b': b, 'c': c})
# assert dfd.approximate_dependencies([0, 1], 2, df, .97, mask, 0.90)
# assert not dfd.approximate_dependencies(set([0, 1]), 2, df, .98, mask, 0.90)
# c[35] = False
# df = pd.DataFrame({'a': a, 'b': b, 'c': c})
# assert dfd.approximate_dependencies([0, 1], 2, df, .95, mask, 0.90)
# assert not dfd.approximate_dependencies([0, 1], 2, df, .96, mask, 0.90)
def test_approximate_dependencies():
    """Check ``dfd.approximate_dependencies`` around several accuracy cut-offs.

    Column ``c`` starts out equal to ``(a + b) < 4`` on all 40 rows.  Two
    entries of ``c`` are then flipped, one at a time, and after each flip the
    dependency ``{a, b} -> c`` is tested at one accuracy that must still be
    accepted and one that must be rejected.
    """
    masks = dfd.Masks(['a', 'b', 'c'])
    col_a = [6, 2, 3, 7, 8, 1, 0, 2, 0, 3, 6, 0, 4, 6, 8, 7, 6, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, 6, 4, 6, 8]
    col_b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
    col_c = [False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]

    def make_frame():
        # Snapshot the current column lists into a fresh DataFrame.
        return pd.DataFrame({'a': col_a, 'b': col_b, 'c': col_c})

    def holds(frame, accuracy):
        # Every check uses the same LHS/RHS and the same shared Masks object.
        return dfd.approximate_dependencies(['a', 'b'], 'c', frame, accuracy, masks)

    # Untouched data.
    frame = make_frame()
    assert holds(frame, 1.00)
    assert holds(frame, .90)

    # Flip row 0 so that it no longer follows (a + b) < 4.
    col_c[0] = True
    frame = make_frame()
    assert holds(frame, .97)
    assert not holds(frame, .98)

    # Flip row 35 as well.
    col_c[35] = False
    frame = make_frame()
    assert holds(frame, .95)
    assert not holds(frame, .96)


def test_approximate_dependencies_with_nan():
    """Check ``dfd.approximate_dependencies`` when an LHS column contains NaN.

    Column ``a`` holds ``np.nan`` in six rows; the dependency
    ``{a, b} -> c`` is expected to be accepted at accuracy 0.9.
    """
    masks = dfd.Masks(['a', 'b', 'c'])
    col_a = [np.nan, 2, 3, 7, 8, 1, 0, 2, 0, 3, np.nan, 0, 4, np.nan, 8, 7, np.nan, 8, 1, 5, 1, 3, 3, 0, 0, 4, 5, 5, 7, 0, 8, 2, 4, 7, 0, 0, np.nan, 4, np.nan, 8]
    col_b = [1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]
    col_c = [True, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, False, False, False, True, False, True, False, False, True, True, False, False, False, False]
    frame = pd.DataFrame({'a': col_a, 'b': col_b, 'c': col_c})

    result = dfd.approximate_dependencies(['a', 'b'], 'c', frame, 0.9, masks)
    assert result
48 changes: 48 additions & 0 deletions autonormalize/tests/test_normalize.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal

import autonormalize as an
from autonormalize import classes, normalize


Expand Down Expand Up @@ -178,3 +180,49 @@ def test_make_indexes():
# Make sure new column names are sorted
assert 'hemisphere_month' in new_dfs[0].columns
assert 'hemisphere_month' in new_dfs[1].columns


def test_make_indexes_improper_column_drop():
    """Check that ``MSSubClass`` is still a column of the first returned frame.

    Builds a small 11-row table with a hand-written dependency set, runs
    ``normalize_dataframe`` followed by ``make_indexes``, and asserts that
    ``MSSubClass`` is present in the first DataFrame from ``return_dfs()``.
    """
    frame = pd.DataFrame({
        'Id': list(range(11)),
        'MSSubClass': [90, 60, 90, 90, 20, 50, 80, 20, 60, 20, 20],
        'MSZoning': ['RL', 'RL', 'RL', 'RL', 'RL', 'RM', 'RL', 'RL', 'RL', 'RL', 'RL'],
        'LotFrontage': [55.0, np.nan, 42.0, 100.0, np.nan, 98.0, 70.0, 85.0, 65.0, 78.0, 60.0],
        'LotArea': [12640, 8755, 7711, 25000, 14375, 8820, 8163, 14536, 14006, 9360, 7200],
        'Alley': [np.nan] * 11,
        'LotShape': ['IR1', 'IR1', 'IR1', 'Reg', 'IR1', 'Reg', 'Reg', 'Reg', 'IR1', 'Reg', 'Reg'],
        'LandContour': ['Lvl', 'Lvl', 'Lvl', 'Low', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl'],
        'Utilities': ['AllPub', 'AllPub', 'AllPub', 'AllPub', 'NoSeWa', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub'],
    })

    # Maps each attribute to the list of LHS attribute sets that determine it.
    dependency_map = {
        'Id': [['LotArea']],
        'MSSubClass': [['LotArea'], ['LotFrontage', 'Utilities'], ['Id']],
        'MSZoning': [['LotFrontage'], ['LotArea'], ['MSSubClass'], ['Id']],
        'LotFrontage': [['LotArea'], ['Id']],
        'LotArea': [['Id']],
        'Alley': [['LotFrontage'], ['LandContour'], ['Utilities'], ['MSSubClass'], ['Id'], ['MSZoning'], ['LotArea'], ['LotShape']],
        'LotShape': [['LotFrontage'], ['MSSubClass', 'Utilities', 'LandContour'], ['LotArea'], ['Id']],
        'LandContour': [['LotFrontage'], ['MSSubClass', 'LotShape'], ['LotArea'], ['Id']],
        'Utilities': [['MSSubClass', 'LotShape'], ['LotArea'], ['MSSubClass', 'LotFrontage'], ['Id']],
    }
    # NOTE(review): the primary key is spelled 'id' here while the column is
    # 'Id' -- confirm this is intentional.
    deps = classes.Dependencies(dependency_map, ['id'])

    root = normalize.DepDF(deps, frame, deps.get_prim_key())
    normalize.normalize_dataframe(root)
    normalize.make_indexes(root)
    tables = root.return_dfs()

    assert 'MSSubClass' in tables[0].columns


def test_issue19():
    """Smoke test: run ``an.auto_entityset`` on the 11-row sample table.

    There is no assertion; the test only requires that the call completes
    and that the resulting object can be printed.
    """
    columns = {
        'Id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'MSSubClass': [90, 60, 90, 90, 20, 50, 80, 20, 60, 20, 20],
        'MSZoning': ['RL', 'RL', 'RL', 'RL', 'RL', 'RM', 'RL', 'RL', 'RL', 'RL', 'RL'],
        'LotFrontage': [55.0, np.nan, 42.0, 100.0, np.nan, 98.0, 70.0, 85.0, 65.0, 78.0, 60.0],
        'LotArea': [12640, 8755, 7711, 25000, 14375, 8820, 8163, 14536, 14006, 9360, 7200],
        'Alley': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        'LotShape': ['IR1', 'IR1', 'IR1', 'Reg', 'IR1', 'Reg', 'Reg', 'Reg', 'IR1', 'Reg', 'Reg'],
        'LandContour': ['Lvl', 'Lvl', 'Lvl', 'Low', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl', 'Lvl'],
        'Utilities': ['AllPub', 'AllPub', 'AllPub', 'AllPub', 'NoSeWa', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub', 'AllPub'],
    }
    frame = pd.DataFrame(columns)

    entity_set = an.auto_entityset(frame, accuracy=1.0, name="es")
    print(entity_set)