@@ -867,6 +867,48 @@ def test_iter(self):
867
867
data .append (col )
868
868
self .assertEqual (data , columns )
869
869
870
def test_drop_duplicates(self):
    # Check that the table's drop_duplicates() agrees with pandas'
    # DataFrame.drop_duplicates() on row counts for a single-column subset,
    # a multi-column subset, and the no-subset (all columns) case.
    tbl = self.table
    # get_cars_df is a method: it must be *called* to obtain the DataFrame.
    df = self.get_cars_df()

    # Drop duplicates for a single-column subset.
    tbl_dropped = tbl.drop_duplicates(casout={'replace': True}, subset='Make')
    df_dropped = df.drop_duplicates(subset='Make')

    # Equivalent to pandas in size.
    self.assertEqual(len(tbl_dropped), len(df_dropped))
    # Number of elements in the 'Make' column should be the same as the
    # number of unique elements.
    self.assertEqual(tbl_dropped['Make'].nunique(), len(tbl_dropped['Make']))
    self.assertEqual(tbl_dropped['Make'].nunique(), len(tbl_dropped))

    # Drop duplicates for a multi-element subset.
    # NOTE(review): the explicit column list below names 'Origin', not
    # 'Country' -- confirm that 'Country' really exists in this table.
    tbl_dropped_multi = tbl.drop_duplicates(casout={'replace': True},
                                            subset=['Country', 'Type'])
    df_dropped_multi = df.drop_duplicates(subset=['Country', 'Type'])

    # Equivalent to pandas in size.
    self.assertEqual(len(tbl_dropped_multi), len(df_dropped_multi))

    # We need some rows where the values of every column are duplicated.
    n_duplicates = 7
    fetched_rows = self.s.fetch(table=self.table, to=n_duplicates)['Fetch']
    # Per the original author's note, the upload otherwise converts the
    # char columns to varchar, so the character column types and lengths
    # are spelled out explicitly.
    subset = self.s.upload_frame(
        fetched_rows,
        casout={'replace': True},
        importOptions={
            'fileType': 'CSV',
            'vars': [
                {'name': 'Make', 'type': 'CHAR', 'length': 13},
                {'name': 'Model', 'type': 'CHAR', 'length': 40},
                {'name': 'Type', 'type': 'CHAR', 'length': 8},
                {'name': 'Origin', 'type': 'CHAR', 'length': 6},
                {'name': 'DriveTrain', 'type': 'CHAR', 'length': 5},
            ],
        },
    )
    # This table is like tbl, but with n_duplicates fully duplicated rows.
    duplicate_table = tbl.append(subset)

    # Drop duplicates without a subset (checks all columns).
    tbl_dropped_all = duplicate_table.drop_duplicates(casout={'replace': True})

    # Make sure that the correct number of rows was dropped.
    self.assertEqual(len(tbl), len(tbl_dropped_all))
    self.assertEqual(len(duplicate_table), len(tbl_dropped_all) + n_duplicates)
911
+
870
912
def test_column_iter (self ):
871
913
df = self .get_cars_df ()
872
914
tbl = self .table
0 commit comments