
Commit 0c55fef

updating docstrings to include information about how keys are chosen, indexes created, and updated README
1 parent 1184162 commit 0c55fef

7 files changed: +65, -379 lines


README.md

Lines changed: 6 additions & 3 deletions
@@ -60,9 +60,12 @@ Returns:
 ```shell
 normalize_dataframe(df, dependencies)
 ```
-Normalizes dataframe based on the dependencies given.
+Normalizes dataframe based on the dependencies given. Keys for the newly created DataFrames can only be columns that are strings, ints, or categories. Keys are chosen according to the priority:
+1) shortest length
+2) has "id" in some form in the name of an attribute
+3) has attribute furthest to the left in the table
 
 Returns:
 
 `new_dfs` (list[pd.DataFrame]) : list of new dataframes

@@ -71,7 +74,7 @@ Returns:
 ```shell
 make_entityset(df, dependencies, name=None, time_index=None)
 ```
-Creates a normalized EntitySet from dataframe based on the dependencies given.
+Creates a normalized EntitySet from dataframe based on the dependencies given. Keys are chosen in the same fashion as for `normalize_dataframe`, and a new index will be created if any key has more than a single attribute.
 
 Returns:

autonormalize/.DS_Store

0 Bytes
Binary file not shown.

autonormalize/autonormalize.py

Lines changed: 13 additions & 15 deletions
@@ -4,7 +4,7 @@
 from .classes import Dependencies
 
 
-def find_dependencies(df, accuracy=0.98, rep_percent=0.85, index=None):
+def find_dependencies(df, accuracy=0.98, index=None):
     """
     Finds dependencies within dataframe df with the DFD search algorithm.
     Returns the dependencies as a Dependencies object.
@@ -17,19 +17,14 @@ def find_dependencies(df, accuracy=0.98, rep_percent=0.85, index=None):
         required in order to conclude a dependency (i.e. with accuracy = 0.98,
         0.98 of the rows must hold true the dependency LHS --> RHS)
 
-        rep_percent (0 < float <= 1.00; default = 0.85) : the maximum amount of
-        data that may be unique in order to determine a dependency (i.e. with
-        rep_percent = 0.85, if less than 15% of rows are repeated for the columns
-        in LHS + RHS, no dependency will be concluded.)
-
         index (str, optional) : name of column that is intended index of df
 
     Returns:
 
         dependencies (Dependencies) : the dependencies found in the data
         within the constraints provided
     """
-    deps = Dependencies(dfd.dfd(df, accuracy, rep_percent, index))
+    deps = Dependencies(dfd.dfd(df, accuracy, index))
     if index is None:
         prim_key = normalize.choose_index(deps.find_candidate_keys(), df)
         deps.set_prim_key(prim_key)
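A short sketch of the new `find_dependencies` signature (no more `rep_percent`); the DataFrame here is invented for illustration:

```python
import pandas as pd
from autonormalize import autonormalize as an  # import path assumed

df = pd.DataFrame({
    'id': [0, 1, 2, 3],
    'city': ['NY', 'NY', 'SF', 'SF'],
    'state': ['NY', 'NY', 'CA', 'CA'],
})

# accuracy=1 accepts only exact dependencies; values below 1 tolerate
# up to a (1 - accuracy) fraction of violating rows.
deps = an.find_dependencies(df, accuracy=1, index='id')
```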
@@ -57,7 +52,11 @@ def normalize_dependencies(df, dependencies):
 
 def normalize_dataframe(df, dependencies):
     """
-    Normalizes a dataframe based on the dependencies given.
+    Normalizes a dataframe based on the dependencies given. Keys for the newly
+    created DataFrames can only be columns that are strings, ints, or
+    categories. Keys are chosen according to the priority:
+    1) shortest length 2) has "id" in some form in the name of an attribute
+    3) has attribute furthest to the left in the table
 
     Arguments:
         df (pd.DataFrame) : dataframe to split up
@@ -74,6 +73,10 @@ def normalize_dataframe(df, dependencies):
 def make_entityset(df, dependencies, name=None, time_index=None):
     """
     Creates a normalized EntitySet from df based on the dependencies given.
+    Keys for the newly created DataFrames can only be columns that are strings,
+    ints, or categories. Keys are chosen according to the priority:
+    1) shortest length 2) has "id" in some form in the name of an attribute
+    3) has attribute furthest to the left in the table
 
     Arguments:
         df (pd.DataFrame) : dataframe to normalize and make entity set from
@@ -107,7 +110,7 @@ def make_entityset(df, dependencies, name=None, time_index=None):
     return ft.EntitySet(name, entities, relationships)
 
 
-def auto_entityset(df, accuracy=0.98, rep_percent=0.85, index=None, name=None, time_index=None):
+def auto_entityset(df, accuracy=0.98, index=None, name=None, time_index=None):
     """
     Creates a normalized entityset from a dataframe.
@@ -119,11 +122,6 @@ def auto_entityset(df, accuracy=0.98, rep_percent=0.85, index=None, name=None, t
         required in order to conclude a dependency (i.e. with accuracy = 0.98,
         0.98 of the rows must hold true the dependency LHS --> RHS)
 
-        rep_percent (0 < float <= 1.00; default = 0.85) : the maximum amount of
-        data that may be unique in order to determine a dependency (i.e. with
-        rep_percent = 0.85, if less than 15% of rows are repeated for the columns
-        in LHS + RHS, no dependency will be concluded.)
-
         index (str, optional) : name of column that is intended index of df
 
         name (str, optional) : the name of created EntitySet
@@ -134,7 +132,7 @@ def auto_entityset(df, accuracy=0.98, rep_percent=0.85, index=None, name=None, t
 
         entityset (ft.EntitySet) : created entity set
     """
-    return make_entityset(df, find_dependencies(df, accuracy, rep_percent, index), name, time_index)
+    return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)
 
 
 def auto_normalize(df):
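Correspondingly, `auto_entityset` now takes only `accuracy` and `index` to steer the dependency search; a hedged sketch with invented data:

```python
import pandas as pd
from autonormalize import autonormalize as an  # import path assumed

df = pd.DataFrame({
    'transaction_id': [0, 1, 2, 3],
    'session_id': [1, 1, 2, 2],
    'amount': [12.5, 3.0, 7.25, 9.0],
})

# Shorthand for make_entityset(df, find_dependencies(df, accuracy, index), ...)
es = an.auto_entityset(df, accuracy=1, index='transaction_id', name='demo')
print(es)
```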

autonormalize/demos/AutoNormalize + FeatureTools Demo.ipynb

Lines changed: 29 additions & 21 deletions
@@ -16,7 +16,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -39,7 +39,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
  "metadata": {
   "scrolled": true
  },
@@ -146,7 +146,7 @@
   "2 1973-07-28 A "
  ]
 },
- "execution_count": 3,
+ "execution_count": 2,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -174,7 +174,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 3,
  "metadata": {},
  "outputs": [
  {
@@ -254,7 +254,7 @@
   "3 1 2014-01-03 18:39:30 2834.44"
  ]
 },
- "execution_count": 6,
+ "execution_count": 3,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -278,7 +278,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 7,
+ "execution_count": 4,
  "metadata": {},
  "outputs": [
  {
@@ -351,7 +351,7 @@
   "3 1 2014-01-03 17:39:30 True"
  ]
 },
- "execution_count": 7,
+ "execution_count": 4,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -364,7 +364,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 8,
+ "execution_count": 5,
  "metadata": {
   "scrolled": false
  },
@@ -419,7 +419,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 9,
+ "execution_count": 6,
  "metadata": {
   "scrolled": true
  },
@@ -428,7 +428,7 @@
  "name": "stderr",
  "output_type": "stream",
  "text": [
-  "100%|██████████| 10/10 [00:13<00:00, 1.41s/it]\n"
+  "100%|██████████| 10/10 [00:01<00:00, 7.11it/s]\n"
  ]
 },
 {
@@ -449,7 +449,7 @@
  }
 ],
 "source": [
- "es = an.auto_entityset(transaction_df, name=\"transactions\", time_index='transaction_time')\n",
+ "es = an.auto_entityset(transaction_df, accuracy=1, name=\"transactions\", time_index='transaction_time')\n",
  "es.add_last_time_indexes()\n",
  "print(es)"
 ]
@@ -643,15 +643,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "\n",
- "\n",
- "*write*\n",
- "\n",
- "\n",
- "\n",
- "In the feature matrix, let’s extract the labels and fill any missing values with zeros. Then, one-hot encode all categorical features by using encode_features().\n",
- "\n",
- "After preprocessing, we split the features and corresponding labels each into training and testing sets."
+ "Now we preprocess our features, and split the features and corresponding labels into training and testing sets."
 ]
},
{
@@ -672,6 +664,13 @@
  ")"
 ]
},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "Now we train a random forest classifier on the training set, and then test the model's performance by evaluating predictions on the testing set."
+ ]
+},
{
 "cell_type": "code",
 "execution_count": 10,
@@ -702,7 +701,9 @@
{
 "cell_type": "code",
 "execution_count": 11,
- "metadata": {},
+ "metadata": {
+  "scrolled": true
+ },
 "outputs": [
 {
  "name": "stdout",
@@ -725,6 +726,13 @@
  "print(classification_report(y_test, y_hat))"
 ]
},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "This plot uses scores obtained from the model to illustrate which features it considers important for predictions."
+ ]
+},
{
 "cell_type": "code",
 "execution_count": 14,

autonormalize/dfd.py

Lines changed: 7 additions & 22 deletions
@@ -10,7 +10,7 @@
 # run script.py to see a couple examples
 
 
-def dfd(df, accuracy, rep_percent, index=None):
+def dfd(df, accuracy, index=None):
     """
     Main loop of DFD algorithm. It returns all the dependencies represented
     in the data in dataframe df. Refer to section 3.2 of paper for literature.
@@ -28,11 +28,6 @@ def dfd(df, accuracy, rep_percent, index=None):
         to conclude a dependency (i.e. with accuracy = 0.98, 0.98 of the rows
         must hold true the dependency LHS --> RHS)
 
-        rep_percent (0 < float <= 1.00) : the maximum amount of
-        data that may be unique in order to determine a dependency (i.e. with
-        rep_percent = 0.85, if less than 15% of rows are repeated for the columns
-        in LHS + RHS, no dependency will be concluded.)
-
     Returns:
 
         minimal_dependencies (DfdDependencies) : the minimal dependencies
@@ -49,12 +44,12 @@ def dfd(df, accuracy, rep_percent, index=None):
             non_uniq.remove(i)
             dependencies.add_unique_lhs(i)
     for i in tqdm(non_uniq):
-        lhss = find_LHSs(i, non_uniq, df, partitions, accuracy, masks, rep_percent)
+        lhss = find_LHSs(i, non_uniq, df, partitions, accuracy, masks)
         dependencies.add_LHSs(i, lhss)
     return dependencies
 
 
-def find_LHSs(rhs, attrs, df, partitions, accuracy, masks, rep_percent):
+def find_LHSs(rhs, attrs, df, partitions, accuracy, masks):
     """
     Finds all LHS sets of attributes that satisfy a dependency relation for the
     RHS attribute i. This is such that LHS --> RHS.
@@ -76,11 +71,6 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks, rep_percent):
 
         masks (Masks) : contains past calculated masks
 
-        rep_percent (0 < float <= 1.00) : the maximum amount of data that may be
-        unique in order to determine a dependency (i.e. with rep_percent = 0.85,
-        if less than 15% of rows are repeated for the columns in LHS + RHS, no
-        dependency will be concluded.)
-
     Returns:
         lhss (LHSs) : all the LHS that determine rhs
     """
@@ -106,7 +96,7 @@ def find_LHSs(rhs, attrs, df, partitions, accuracy, masks, rep_percent):
         else:
             node.infer_type()
         if node.category == 0:
-            if compute_partitions(df, rhs, node.attrs, partitions, accuracy, masks, rep_percent):
+            if compute_partitions(df, rhs, node.attrs, partitions, accuracy, masks):
                 if node.is_minimal():
                     min_deps.add_dep(node.attrs)
                     node.category = 2
@@ -296,7 +286,7 @@ def generate_next_seeds(max_non_deps, min_deps, lhs_attrs):
     return list(seeds)
 
 
-def compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks, rep_percent):
+def compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks):
     """
     Returns true if lhs_set --> rhs for dataframe df.
@@ -318,17 +308,12 @@ def compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks, rep_percen
 
         masks (Masks) : contains past calculated masks
 
-        rep_percent (0 < float <= 1.00) : the maximum amount of data that may be
-        unique in order to determine a dependency (i.e. with rep_percent = 0.85,
-        if less than 15% of rows are repeated for the columns in LHS + RHS, no
-        dependency will be concluded.)
-
     Returns:
         is_dependency (bool) : True if is a dependency, false otherwise
     """
     # for approximate dependencies see TANE section 2.3
     if accuracy < 1:
-        return approximate_dependencies(list(lhs_set), rhs, df, accuracy, masks, rep_percent)
+        return approximate_dependencies(list(lhs_set), rhs, df, accuracy, masks)
     part_rhs = partition(lhs_set.union(set([rhs])), df, partitions)
     # if part_rhs > df.shape[0] * rep_percent:
     #     return False
@@ -347,7 +332,7 @@ def partition(attrs, df, partitions):
     return shape
 
 
-def approximate_dependencies(lhs_set, rhs, df, accuracy, masks, rep_percent):
+def approximate_dependencies(lhs_set, rhs, df, accuracy, masks):
     """
     Checks whether the columns represented in lhs_set functionally determine the column rhs
     for the dataframe df.
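For intuition about the `accuracy < 1` branch in `compute_partitions`, here is a rough, hypothetical illustration of the TANE-style approximate-dependency criterion; it is not the library's implementation:

```python
import pandas as pd

def approx_holds(df, lhs, rhs, accuracy):
    # LHS --> RHS approximately holds if keeping only the most common RHS
    # value within each LHS group retains at least accuracy * len(df) rows.
    kept = df.groupby(lhs)[rhs].apply(lambda s: s.value_counts().iloc[0]).sum()
    return kept >= accuracy * len(df)

df = pd.DataFrame({'zip': [1, 1, 1, 2], 'city': ['A', 'A', 'B', 'C']})
print(approx_holds(df, ['zip'], 'city', accuracy=0.75))  # True: only 1 of 4 rows violates
```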
