@@ -867,6 +867,57 @@ def test_iter(self):
867
867
data .append (col )
868
868
self .assertEqual (data , columns )
869
869
870
def test_drop_duplicates(self):
    """Check ``drop_duplicates`` on the test table against pandas.

    Three scenarios are exercised:

    1. a single-column ``subset`` (``'Make'``),
    2. a multi-column ``subset`` (``['Origin', 'Type']``),
    3. no ``subset`` at all, after appending rows that are full
       duplicates of existing rows.

    """
    # Pull in the table both as a CAS table and as a pandas DataFrame
    tbl = self.table
    df = self.get_cars_df()

    # --- Single-column subset -------------------------------------------
    tbl_dropped = tbl.drop_duplicates(
        casout={'replace': True, 'name': 'drop-test-1'},
        subset='Make',
    )
    df_dropped = df.drop_duplicates(subset='Make')

    # Equivalent to pandas in size
    self.assertEqual(len(tbl_dropped), len(df_dropped))

    # Every remaining 'Make' value must be unique, so the number of
    # distinct values equals both the column length and the table length
    n_unique_makes = tbl_dropped['Make'].nunique()
    self.assertEqual(n_unique_makes, len(tbl_dropped['Make']))
    self.assertEqual(n_unique_makes, len(tbl_dropped))

    # --- Multi-column subset --------------------------------------------
    tbl_dropped_multi = tbl.drop_duplicates(
        casout={'replace': True, 'name': 'drop-test-2'},
        subset=['Origin', 'Type'],
    )
    df_dropped_multi = df.drop_duplicates(subset=['Origin', 'Type'])

    # Equivalent to pandas in size
    self.assertEqual(len(tbl_dropped_multi), len(df_dropped_multi))

    # --- No subset: whole-row duplicates --------------------------------
    # We need some rows where the values of every column are duplicated,
    # so fetch the first few rows and upload them as a separate table
    n_duplicates = 7
    fetched_rows = self.s.fetch(table=self.table, to=n_duplicates)['Fetch']

    # Must specify char type and explicit length for the character columns
    import_options = {
        'fileType': 'CSV',
        'vars': [
            {'name': 'Make', 'type': 'CHAR', 'length': 13},
            {'name': 'Model', 'type': 'CHAR', 'length': 40},
            {'name': 'Type', 'type': 'CHAR', 'length': 8},
            {'name': 'Origin', 'type': 'CHAR', 'length': 6},
            {'name': 'DriveTrain', 'type': 'CHAR', 'length': 5},
        ],
    }
    subset = self.s.upload_frame(
        fetched_rows,
        casout={'replace': True, 'name': 'drop-test-3'},
        importOptions=import_options,
    )

    # This table is like tbl, but with n_duplicates fully duplicated rows
    duplicate_table = tbl.append(subset)

    # Drop duplicates without a subset (checks all columns)
    tbl_dropped_all = duplicate_table.drop_duplicates(
        casout={'replace': True, 'name': 'drop-test-4'},
    )

    # Make sure that the correct number of rows was dropped.
    # NOTE(review): this presumes the source table has no fully duplicated
    # rows of its own -- confirm against the test data set.
    self.assertEqual(len(tbl), len(tbl_dropped_all))
    self.assertEqual(len(duplicate_table),
                     len(tbl_dropped_all) + n_duplicates)
920
+
870
921
def test_column_iter (self ):
871
922
df = self .get_cars_df ()
872
923
tbl = self .table
0 commit comments