@@ -867,6 +867,57 @@ def test_iter(self):
867867 data .append (col )
868868 self .assertEqual (data , columns )
869869
def test_drop_duplicates(self):
    """Check ``drop_duplicates`` on the test table against pandas.

    Three scenarios are exercised:

    * a single-column ``subset`` ('Make'),
    * a multi-column ``subset`` ('Origin' and 'Type'),
    * no ``subset`` at all, on a table to which a handful of already
      existing rows have been appended so that fully duplicated rows
      are present.
    """
    # Pull in the table as a CAS table and as a pandas DataFrame
    tbl = self.table
    df = self.get_cars_df()

    # --- Drop duplicates for a single-column subset ---
    tbl_dropped = tbl.drop_duplicates(
        casout={'replace': True, 'name': 'drop-test-1'},
        subset='Make',
    )
    df_dropped = df.drop_duplicates(subset='Make')

    # NOTE: ``assertEqual`` is used throughout; the ``assertEquals`` alias
    # is deprecated and was removed from unittest in Python 3.12.

    # Equivalent to pandas in size
    self.assertEqual(len(tbl_dropped), len(df_dropped))
    # Number of elements in 'Make' column should be the same as the
    # number of unique elements
    self.assertEqual(tbl_dropped['Make'].nunique(), len(tbl_dropped['Make']))
    self.assertEqual(tbl_dropped['Make'].nunique(), len(tbl_dropped))

    # --- Drop duplicates for a multi-element subset ---
    tbl_dropped_multi = tbl.drop_duplicates(
        casout={'replace': True, 'name': 'drop-test-2'},
        subset=['Origin', 'Type'],
    )
    df_dropped_multi = df.drop_duplicates(subset=['Origin', 'Type'])

    # Equivalent to pandas in size
    self.assertEqual(len(tbl_dropped_multi), len(df_dropped_multi))

    # --- Drop duplicates without a subset (checks all columns) ---
    # We need some rows where the values of every column are duplicated,
    # so fetch a few existing rows and upload them as a separate table.
    n_duplicates = 7
    fetched_rows = self.s.fetch(table=self.table, to=n_duplicates)['Fetch']
    # Must specify char type and explicit length
    import_options = {
        'fileType': 'CSV',
        'vars': [
            {'name': 'Make', 'type': 'CHAR', 'length': 13},
            {'name': 'Model', 'type': 'CHAR', 'length': 40},
            {'name': 'Type', 'type': 'CHAR', 'length': 8},
            {'name': 'Origin', 'type': 'CHAR', 'length': 6},
            {'name': 'DriveTrain', 'type': 'CHAR', 'length': 5},
        ],
    }
    subset = self.s.upload_frame(
        fetched_rows,
        casout={'replace': True, 'name': 'drop-test-3'},
        importOptions=import_options,
    )

    # This table is like tbl, but with n_duplicates fully duplicate rows
    duplicate_table = tbl.append(subset)

    tbl_dropped_all = duplicate_table.drop_duplicates(
        casout={'replace': True, 'name': 'drop-test-4'},
    )

    # Make sure that the correct number of rows were dropped
    self.assertEqual(len(tbl), len(tbl_dropped_all))
    self.assertEqual(len(duplicate_table), len(tbl_dropped_all) + n_duplicates)

920+
870921 def test_column_iter (self ):
871922 df = self .get_cars_df ()
872923 tbl = self .table
0 commit comments