Improve performance of reading files with duplicate column names (#955)

wentasah · web-flow · commit 482a187d2091 · 2021-12-24T11:47:06.000-07:00
* Add tests for duplicate column name handling

In the next commit, we'll be changing the code responsible for naming
duplicate columns and these tests should ensure that the behavior
doesn't change.

* Improve performance of reading files with duplicate column names

I need to load a file with 30k columns, 10k of which have the same
name. Currently, this is practically impossible because makeunique(),
which produces unique column names, has cubic complexity.

This commit changes the algorithm to use a Dict to quickly look up the
existence of columns and to cache the next numeric suffix used to
uniquify column names.

Care has been taken to ensure that columns are named the same way as
before. To that end, additional tests were added in the previous
commit.
diff --git a/src/utils.jl b/src/utils.jl
@@ -349,17 +349,20 @@ function makeunique(names)
     set = Set(names)
     length(set) == length(names) && return Symbol[Symbol(x) for x in names]
     nms = Symbol[]
+    nextsuffix = Dict{eltype(names), UInt}()
     for nm in names
-        if nm in nms
-            k = 1
+        if haskey(nextsuffix, nm)
+            k = nextsuffix[nm]
             newnm = Symbol("$(nm)_$k")
-            while newnm in set || newnm in nms
+            while newnm in set || haskey(nextsuffix, newnm)
                 k += 1
                 newnm = Symbol("$(nm)_$k")
             end
+            nextsuffix[nm] = k + 1
             nm = newnm
         end
         push!(nms, nm)
+        nextsuffix[nm] = 1
     end
     @assert length(names) == length(nms)
     return nms
diff --git a/test/basics.jl b/test/basics.jl
@@ -748,4 +748,14 @@ f = CSV.File(IOBuffer("a,b\n1,2\n3,"))
 @test f.a == [1, 3]
 @test isequal(f.b, [2, missing])
 
+# duplicate column names
+f = CSV.File(IOBuffer("a,a,a\n"))
+@test f.names == [:a, :a_1, :a_2]
+
+f = CSV.File(IOBuffer("a,a_1,a\n"))
+@test f.names == [:a, :a_1, :a_2]
+
+f = CSV.File(IOBuffer("a,a,a_1\n")) # this case is not covered in test_duplicate_columnnames.csv
+@test f.names == [:a, :a_2, :a_1]
+
 end