Merge pull request #163 from mjerick25/drop_duplicates

bkemper24 · web-flow · commit 22387dd4af51 · 2023-07-13T08:17:09.000-04:00
Implemented drop_duplicates method for CASTable
diff --git a/swat/cas/table.py b/swat/cas/table.py
@@ -5880,8 +5880,54 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'):
         out._sortby = list(self._sortby)
         return out
 
-#   def drop_duplicates(self, *args, **kwargs):
-#       raise NotImplementedError
+    def drop_duplicates(self, casout, subset=[]):
+        '''
+        Remove duplicate rows from a CASTable. Optionally, consider only
+        a subset of columns when checking for duplicate rows.
+
+        Parameters
+        --------
+        casout : string or :class:`CASTable` or dict
+            The output table.
+        subset : string or list-of-strings, optional
+            The subset of columns to consider when checking for duplicate rows.
+
+        Returns
+        --------
+        :class:`CASTable`
+            The input table without duplicate rows.
+        '''
+        self._loadactionset('deduplication')
+
+        cols = [x for x in list(self.columns)]
+        # Determine what columns/combo of columns we are looking for duplicates
+        if not subset:
+            # Subset empty -> we look in all columns for duplicates
+            for col in cols:
+                subset.append(col)
+        else:
+            # If subset is just a string, iteration will be through characters
+            if isinstance(subset, six.string_types):
+                subset = [subset]
+            # Determine if all provided columns in subset are in the table
+            for col in subset:
+                if col not in cols:
+                    raise ValueError("Provided column " + col + " is not in the table.")
+
+        # We run this aciton to drop duplicates from the original table
+        # It is not returned -> we have to manually grab results from casout
+        self.groupby(subset)._retrieve('deduplication.deduplicate',
+                                       casout=casout, noDuplicateKeys=True)
+
+        # Fetch the output table
+        if isinstance(casout, CASTable):
+            out = casout
+        elif isinstance(casout, dict):
+            out = self.get_connection().CASTable(**casout)
+        else:
+            out = self.get_connection().CASTable(casout)
+
+        return out
 
 #   def duplicated(self, *args, **kwargs):
 #       raise NotImplementedError
diff --git a/swat/tests/cas/test_table.py b/swat/tests/cas/test_table.py
@@ -867,6 +867,57 @@ def test_iter(self):
             data.append(col)
         self.assertEqual(data, columns)
 
+    def test_drop_duplicates(self):
+        # pull in table as CASTable and as pandas DataFrame
+        tbl = self.table
+        df = self.get_cars_df()
+        # drop duplicates for single subset
+        tbl_dropped = tbl.drop_duplicates(casout={'replace': True,
+                                                  'name': 'drop-test-1'},
+                                          subset='Make')
+        df_dropped = df.drop_duplicates(subset='Make')
+
+        # Equivalent to pandas in size
+        self.assertEquals(len(tbl_dropped), len(df_dropped))
+        # Number of elements in 'Make' column should be same as number of unique elements
+        self.assertEquals(tbl_dropped['Make'].nunique(), len(tbl_dropped['Make']))
+        self.assertEquals(tbl_dropped['Make'].nunique(), len(tbl_dropped))
+
+        # drop duplicates for multi-element subset
+        tbl_dropped_multi = tbl.drop_duplicates(casout={'replace': True,
+                                                        'name': 'drop-test-2'},
+                                                subset=['Origin', 'Type'])
+        df_dropped_multi = df.drop_duplicates(subset=['Origin', 'Type'])
+
+        # Equivalent to pandas in size
+        self.assertEquals(len(tbl_dropped_multi), len(df_dropped_multi))
+
+        # We need some rows where all values for each col are duplicate
+        nDuplicates = 7
+        fetchTable = self.s.fetch(table=self.table, to=nDuplicates)['Fetch']
+        # Must specify char type and explicit length
+        importOptions = {'fileType': 'CSV',
+                         'vars': [{'name': 'Make', 'type': 'CHAR', 'length': 13},
+                                  {'name': 'Model', 'type': 'CHAR', 'length': 40},
+                                  {'name': 'Type', 'type': 'CHAR', 'length': 8},
+                                  {'name': 'Origin', 'type': 'CHAR', 'length': 6},
+                                  {'name': 'DriveTrain', 'type': 'CHAR', 'length': 5}
+                                  ]}
+        subset = self.s.upload_frame(fetchTable, casout={'replace': True,
+                                                         'name': 'drop-test-3'},
+                                     importOptions=importOptions)
+
+        # This table is like tbl, but with nDuplicate fully duplicate rows
+        duplicate_table = tbl.append(subset)
+
+        # Drop duplicates without subset (checks all cols)
+        tbl_dropped_all = duplicate_table.drop_duplicates(casout={'replace': True,
+                                                                  'name': 'drop-test-4'})
+
+        # Make sure that the correct amount of rows were dropped
+        self.assertEquals(len(tbl), len(tbl_dropped_all))
+        self.assertEquals(len(duplicate_table), len(tbl_dropped_all) + nDuplicates)
+
     def test_column_iter(self):
         df = self.get_cars_df()
         tbl = self.table