Skip to content

Commit 22387dd

Browse files
authored
Merge pull request #163 from mjerick25/drop_duplicates
Implemented drop_duplicates method for CASTable
2 parents 4ec8019 + 3fdc53a commit 22387dd

File tree

2 files changed

+99
-2
lines changed

2 files changed

+99
-2
lines changed

swat/cas/table.py

Lines changed: 48 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5880,8 +5880,54 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'):
58805880
out._sortby = list(self._sortby)
58815881
return out
58825882

5883-
# def drop_duplicates(self, *args, **kwargs):
5884-
# raise NotImplementedError
5883+
def drop_duplicates(self, casout, subset=None):
    '''
    Remove duplicate rows from a CASTable

    Optionally, consider only a subset of columns when checking for
    duplicate rows.

    Parameters
    ----------
    casout : string or :class:`CASTable` or dict
        The output table.
    subset : string or list-of-strings, optional
        The subset of columns to consider when checking for duplicate
        rows.  If not specified (or empty), all columns of the table
        are used.  The given object is never modified.

    Raises
    ------
    ValueError
        If a column named in `subset` is not in the table.

    Returns
    -------
    :class:`CASTable`
        A table object referring to the output table given in `casout`.

    '''
    self._loadactionset('deduplication')

    columns = list(self.columns)

    # Determine which columns / combination of columns identify duplicates.
    # A new list is always built so that neither a caller-supplied list nor
    # a shared default value is ever mutated between calls.
    if not subset:
        # No subset given -> look at all columns for duplicates
        keys = columns
    else:
        # A bare string must be wrapped; iterating it would yield characters
        if isinstance(subset, six.string_types):
            keys = [subset]
        else:
            keys = list(subset)
        # Make sure every requested column actually exists in the table
        for key in keys:
            if key not in columns:
                raise ValueError(
                    'Provided column {} is not in the table.'.format(key))

    # Run the action to drop duplicates from the original table.
    # The result is not returned by the action -> it is written to casout,
    # so a reference to that table has to be constructed manually below.
    self.groupby(keys)._retrieve('deduplication.deduplicate',
                                 casout=casout, noDuplicateKeys=True)

    # Build a table object pointing at the output table
    if isinstance(casout, CASTable):
        return casout
    if isinstance(casout, dict):
        return self.get_connection().CASTable(**casout)
    return self.get_connection().CASTable(casout)
58855931

58865932
# def duplicated(self, *args, **kwargs):
58875933
# raise NotImplementedError

swat/tests/cas/test_table.py

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -867,6 +867,57 @@ def test_iter(self):
867867
data.append(col)
868868
self.assertEqual(data, columns)
869869

870+
def test_drop_duplicates(self):
    ''' Check CASTable.drop_duplicates against pandas and known row counts '''
    # Pull in table as CASTable and as pandas DataFrame
    tbl = self.table
    df = self.get_cars_df()

    # Drop duplicates for single-column subset
    tbl_dropped = tbl.drop_duplicates(casout={'replace': True,
                                              'name': 'drop-test-1'},
                                      subset='Make')
    df_dropped = df.drop_duplicates(subset='Make')

    # Equivalent to pandas in size
    self.assertEqual(len(tbl_dropped), len(df_dropped))
    # Number of elements in 'Make' column should be the same as the
    # number of unique elements
    self.assertEqual(tbl_dropped['Make'].nunique(), len(tbl_dropped['Make']))
    self.assertEqual(tbl_dropped['Make'].nunique(), len(tbl_dropped))

    # Drop duplicates for multi-element subset
    tbl_dropped_multi = tbl.drop_duplicates(casout={'replace': True,
                                                    'name': 'drop-test-2'},
                                            subset=['Origin', 'Type'])
    df_dropped_multi = df.drop_duplicates(subset=['Origin', 'Type'])

    # Equivalent to pandas in size
    self.assertEqual(len(tbl_dropped_multi), len(df_dropped_multi))

    # We need some rows where all values for each column are duplicates
    n_duplicates = 7
    fetched = self.s.fetch(table=self.table, to=n_duplicates)['Fetch']
    # Must specify char type and explicit length
    import_options = {
        'fileType': 'CSV',
        'vars': [
            {'name': 'Make', 'type': 'CHAR', 'length': 13},
            {'name': 'Model', 'type': 'CHAR', 'length': 40},
            {'name': 'Type', 'type': 'CHAR', 'length': 8},
            {'name': 'Origin', 'type': 'CHAR', 'length': 6},
            {'name': 'DriveTrain', 'type': 'CHAR', 'length': 5},
        ],
    }
    extra_rows = self.s.upload_frame(fetched,
                                     casout={'replace': True,
                                             'name': 'drop-test-3'},
                                     importOptions=import_options)

    # This table is like tbl, but with n_duplicates fully duplicate rows
    duplicate_table = tbl.append(extra_rows)

    # Drop duplicates without subset (checks all columns)
    tbl_dropped_all = duplicate_table.drop_duplicates(
        casout={'replace': True, 'name': 'drop-test-4'})

    # Make sure that the correct number of rows were dropped
    self.assertEqual(len(tbl), len(tbl_dropped_all))
    self.assertEqual(len(duplicate_table),
                     len(tbl_dropped_all) + n_duplicates)
920+
870921
def test_column_iter(self):
871922
df = self.get_cars_df()
872923
tbl = self.table

0 commit comments

Comments
 (0)