Skip to content

Commit 19f8f87

Browse files
author
Michael Erickson
committed
Added tests for drop_duplicates
1 parent ffc348e commit 19f8f87

File tree

1 file changed

+42
-0
lines changed

1 file changed

+42
-0
lines changed

swat/tests/cas/test_table.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,48 @@ def test_iter(self):
867867
data.append(col)
868868
self.assertEqual(data, columns)
869869

870+
def test_drop_duplicates(self):
    """Check ``drop_duplicates`` on the test table against pandas.

    Three scenarios are exercised:

    1. a single-column ``subset`` ('Make'),
    2. a multi-column ``subset`` (['Country', 'Type']),
    3. no ``subset`` at all, using a table to which a few already
       present rows have been appended so that fully duplicated rows
       exist.
    """
    tbl = self.table
    # The helper must be *called*; the bare bound method has no
    # ``drop_duplicates`` attribute.
    df = self.get_cars_df()

    # Drop duplicates for a single-column subset
    tbl_dropped = tbl.drop_duplicates(casout={'replace': True},
                                      subset='Make')
    df_dropped = df.drop_duplicates(subset='Make')

    # Equivalent to pandas in size
    self.assertEqual(len(tbl_dropped), len(df_dropped))
    # Number of elements in the 'Make' column should be the same as the
    # number of unique elements
    self.assertEqual(tbl_dropped['Make'].nunique(),
                     len(tbl_dropped['Make']))
    self.assertEqual(tbl_dropped['Make'].nunique(), len(tbl_dropped))

    # Drop duplicates for a multi-element subset
    tbl_dropped_multi = tbl.drop_duplicates(casout={'replace': True},
                                            subset=['Country', 'Type'])
    df_dropped_multi = df.drop_duplicates(subset=['Country', 'Type'])

    # Equivalent to pandas in size
    self.assertEqual(len(tbl_dropped_multi), len(df_dropped_multi))

    # We need some rows where the values of every column are duplicated
    n_duplicates = 7
    fetch_table = self.s.fetch(table=self.table, to=n_duplicates)['Fetch']
    # The upload really wants to convert char to varchar, so the
    # character column types and lengths are specified explicitly
    subset = self.s.upload_frame(
        fetch_table,
        casout={'replace': True},
        importOptions={
            'fileType': 'CSV',
            'vars': [
                {'name': 'Make', 'type': 'CHAR', 'length': 13},
                {'name': 'Model', 'type': 'CHAR', 'length': 40},
                {'name': 'Type', 'type': 'CHAR', 'length': 8},
                {'name': 'Origin', 'type': 'CHAR', 'length': 6},
                {'name': 'DriveTrain', 'type': 'CHAR', 'length': 5},
            ],
        },
    )
    # This table is like tbl, but with n_duplicates fully duplicated rows
    duplicate_table = tbl.append(subset)

    # Drop duplicates without a subset (checks all columns)
    tbl_dropped_all = duplicate_table.drop_duplicates(
        casout={'replace': True})

    # Make sure that the correct number of rows was dropped
    self.assertEqual(len(tbl), len(tbl_dropped_all))
    self.assertEqual(len(duplicate_table),
                     len(tbl_dropped_all) + n_duplicates)
911+
870912
def test_column_iter(self):
871913
df = self.get_cars_df()
872914
tbl = self.table

0 commit comments

Comments
 (0)