Reduce the minimum size for enabling multi-threading from 1_000_000 to 100_000 rows (#3274)

bkamins · web-flow · commit 436b6865ba3d · 2023-02-05T17:31:41.000+01:00
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -212,9 +212,9 @@ mutable struct DataFrame <: AbstractDataFrame
 
         # we write into columns as we know that it is guaranteed
         # that it was freshly allocated in the outer constructor
-        if copycols && len >= 1_000_000 && length(columns) > 1 && Threads.nthreads() > 1
+        if copycols && len >= 100_000 && length(columns) > 1 && Threads.nthreads() > 1
             @sync for i in eachindex(columns)
-                Threads.@spawn columns[i] = _preprocess_column(columns[i], len, copycols)
+                @spawn columns[i] = _preprocess_column(columns[i], len, copycols)
             end
         else
             for i in eachindex(columns)
@@ -558,10 +558,10 @@ function _threaded_getindex(selected_rows::AbstractVector,
                             selected_columns::AbstractVector,
                             df_columns::AbstractVector,
                             idx::AbstractIndex)
-    if length(selected_rows) >= 1_000_000 && Threads.nthreads() > 1
+    if length(selected_rows) >= 100_000 && Threads.nthreads() > 1
         new_columns = Vector{AbstractVector}(undef, length(selected_columns))
         @sync for i in eachindex(new_columns)
-            Threads.@spawn new_columns[i] = df_columns[selected_columns[i]][selected_rows]
+            @spawn new_columns[i] = df_columns[selected_columns[i]][selected_rows]
         end
         return DataFrame(new_columns, idx, copycols=false)
     else
diff --git a/src/groupeddataframe/complextransforms.jl b/src/groupeddataframe/complextransforms.jl
@@ -248,7 +248,6 @@ function _combine_rows_with_first!((firstrow,)::Ref{Any},
     @assert colnames isa NTuple{N, Symbol} where N
     @assert length(colnames) == length(outcols)
     len = length(gd)
-    gdidx = gd.idx
     starts = gd.starts
     ends = gd.ends
 
diff --git a/src/groupeddataframe/utils.jl b/src/groupeddataframe/utils.jl
@@ -5,7 +5,7 @@ function hashrows_col!(h::Vector{UInt},
                        v::AbstractVector{T},
                        rp::Nothing,
                        firstcol::Bool) where T
-    @spawn_for_chunks 1_000_000 for i in eachindex(h)
+    @spawn_for_chunks 100_000 for i in eachindex(h)
         @inbounds begin
             el = v[i]
             h[i] = hash(el, h[i])
@@ -33,18 +33,18 @@ function hashrows_col!(h::Vector{UInt},
         fira = firstindex(ra)
 
         hashes = Vector{UInt}(undef, length(rp))
-        @spawn_for_chunks 1_000_000 for i in eachindex(hashes)
+        @spawn_for_chunks 100_000 for i in eachindex(hashes)
             @inbounds hashes[i] = hash(rp[i+firp-1])
         end
 
         # here we rely on the fact that `DataAPI.refpool` has a continuous
         # block of indices
-        @spawn_for_chunks 1_000_000 for i in eachindex(h)
+        @spawn_for_chunks 100_000 for i in eachindex(h)
             @inbounds ref = ra[i+fira-1]
             @inbounds h[i] = hashes[ref+1-firp]
         end
     else
-        @spawn_for_chunks 1_000_000 for i in eachindex(h, v)
+        @spawn_for_chunks 100_000 for i in eachindex(h, v)
             @inbounds h[i] = hash(v[i], h[i])
         end
     end
@@ -332,10 +332,13 @@ function row_group_slots!(cols::NTuple{N, AbstractVector},
 
     lg = length(groups)
     nt = Threads.nthreads()
-    # disable threading if we are processing a small data frame or number of groups is large
-    if lg < 1_000_000 || ngroups > lg * (0.5 - 1 / (2 * nt)) / (2 * nt)
-       nt = 1
+    # make sure we are processing at least 100_000 rows per task if we do threading
+    if lg < 100_000 * nt
+       nt = max(1, lg ÷ 100_000)
     end
+    # if there are few rows per group limit the number of threads used
+    nt = clamp(round(Int, (lg / 4) / ngroups - 2), 1, nt)
+
     seen = fill(false, ngroups)
     seen_vec = Vector{Vector{Bool}}(undef, nt)
     seen_vec[1] = seen
diff --git a/src/join/composer.jl b/src/join/composer.jl
@@ -249,9 +249,9 @@ function compose_inner_table(joiner::DataFrameJoiner,
         right_ixs = right_ixs[csp_r]
     end
 
-    if Threads.nthreads() > 1 && length(left_ixs) >= 1_000_000
-        dfl_task = Threads.@spawn joiner.dfl[left_ixs, :]
-        dfr_noon_task = Threads.@spawn joiner.dfr[right_ixs, Not(joiner.right_on)]
+    if Threads.nthreads() > 1 && length(left_ixs) >= 100_000
+        dfl_task = @spawn joiner.dfl[left_ixs, :]
+        dfr_noon_task = @spawn joiner.dfr[right_ixs, Not(joiner.right_on)]
         dfl = fetch(dfl_task)
         dfr_noon = fetch(dfr_noon_task)
     else
@@ -384,20 +384,20 @@ function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique
 
     @assert col_idx == ncol(joiner.dfl_on) + 1
 
-    if Threads.nthreads() > 1 && target_nrow >= 1_000_000 && length(cols) > col_idx
+    if Threads.nthreads() > 1 && target_nrow >= 100_000 && length(cols) > col_idx
         @sync begin
             for col in eachcol(dfl_noon)
                 cols_i = left_idxs[col_idx]
-                Threads.@spawn _noon_compose_helper!(cols, _similar_left, cols_i,
-                                                     col, target_nrow, left_ixs, lil + 1,
-                                                     leftonly_ixs, loil)
+                @spawn _noon_compose_helper!(cols, _similar_left, cols_i,
+                                             col, target_nrow, left_ixs, lil + 1,
+                                             leftonly_ixs, loil)
                 col_idx += 1
             end
             @assert col_idx == ncol(joiner.dfl) + 1
             for col in eachcol(dfr_noon)
                 cols_i = col_idx
-                Threads.@spawn _noon_compose_helper!(cols, _similar_right, cols_i, col, target_nrow,
-                                                     right_ixs, lil + loil + 1, rightonly_ixs, roil)
+                @spawn _noon_compose_helper!(cols, _similar_right, cols_i, col, target_nrow,
+                                             right_ixs, lil + loil + 1, rightonly_ixs, roil)
                 col_idx += 1
             end
         end
@@ -422,7 +422,7 @@ function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique
                                               left_ixs, lil + 1, leftonly_ixs, loil)
         new_order = _count_sortperm(left_cols_idxs)
     end
-    if order == :right && !(issorted(right_ixs) && isempty(rightonly_ixs)) 
+    if order == :right && !(issorted(right_ixs) && isempty(rightonly_ixs))
         right_cols_idxs = _sort_compose_helper(nrow(joiner.dfr) + 1,
                                                1:nrow(joiner.dfr), target_nrow,
                                                right_ixs, lil + loil + 1, rightonly_ixs, roil)
diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -175,13 +175,13 @@ function _spawn_for_chunks_helper(iter, lbody, basesize)
             nt = Threads.nthreads()
             len = length(x)
             if nt > 1 && len > basesize
-                tasks = [Threads.@spawn begin
-                                for i in p
-                                    local $(esc(lidx)) = @inbounds x[i]
-                                    $(esc(lbody))
-                                end
-                            end
-                            for p in split_indices(len, basesize)]
+                tasks = [@spawn begin
+                             for i in p
+                                 local $(esc(lidx)) = @inbounds x[i]
+                                 $(esc(lbody))
+                             end
+                         end
+                         for p in split_indices(len, basesize)]
                 foreach(wait, tasks)
             else
                 for i in eachindex(x)
diff --git a/test/grouping.jl b/test/grouping.jl
@@ -3722,10 +3722,10 @@ end
 end
 
 @testset "groupby multithreading" begin
-    for x in (PooledArray(rand(1:10, 1_100_000)),
-              PooledArray(rand([1:9; missing], 1_100_000))),
-        y in (PooledArray(rand(["a", "b", "c", "d"], 1_100_000)),
-              PooledArray(rand(["a"; "b"; "c"; missing], 1_100_000)))
+    for x in (PooledArray(rand(1:10, 210_000)),
+              PooledArray(rand([1:9; missing], 210_000))),
+        y in (PooledArray(rand(["a", "b", "c", "d"], 210_000)),
+              PooledArray(rand(["a"; "b"; "c"; missing], 210_000)))
         df = DataFrame(x=x, y=y)
 
         # Checks are done by groupby_checked
@@ -3777,7 +3777,7 @@ end
     end
 
     Random.seed!(1234)
-    for levs in (100, 99_000), sz in (100_000, 1_100_000)
+    for levs in (100, 89_000), sz in (90_000, 210_000)
         df = DataFrame(x_int=rand(1:levs, sz))
         df.x_str = string.(df.x_int, pad=5)
         df.x_pool = PooledArray(df.x_str)
@@ -4401,7 +4401,7 @@ end
     @test getindex.(keys(groupby(df, [order(:x, rev=true)], sort=NamedTuple())), 1) == ["c", "b", "a"]
     @test getindex.(keys(groupby(df, :x, sort=(;rev=true))), 1) == ["c", "b", "a"]
     @test getindex.(keys(groupby(df, [:x], sort=(;rev=true))), 1) == ["c", "b", "a"]
-   
+
     # by default sorting is not applied as range of values is wide
     df = DataFrame(x=[2, 100, 2, 1, 100])
     @test getindex.(keys(groupby(df, :x)), 1) == [2, 100, 1]
@@ -4434,7 +4434,7 @@ end
     df2 = string.(df1, pad=3)
 
     for df in (df1, df2)
-        for col in (:a, "a", 1, :b, "b", 2, :c, "c", 3) 
+        for col in (:a, "a", 1, :b, "b", 2, :c, "c", 3)
             gdf = groupby(df, order(col))
             @test issorted(DataFrame(gdf)[:, col])
             @test all(x -> issorted(x.c), gdf)
diff --git a/test/indexing.jl b/test/indexing.jl
@@ -2022,7 +2022,7 @@ end
 include("indexing_offset.jl")
 
 @testset "threading correctness tests" begin
-    for x in (10, 1_100_000), y in 1:4
+    for x in (10, 110_000), y in 1:4
         vecvec = [rand(Int8, x) for _ in 1:y]
         df = DataFrame(vecvec, :auto, copycols=false)
         for rowrange in [:, 1:nrow(df)-5, collect(1:nrow(df)-5), axes(df, 1) .< nrow(df)-5],
diff --git a/test/insertion.jl b/test/insertion.jl
@@ -1331,7 +1331,7 @@ end
                             b=[22.5, 2.0, "b", 10.0, 5.0, missing],
                             c=[missing, 3.0, "c", 11.0, 6.0, 16.5],
                             d=[missing, missing, "d", missing, missing, 15])
-    for i in [1, 2, 4, 8, 16, 32, 64, 100, 1000, 10000, 20_000, 100_000]
+    for i in [1, 2, 4, 8, 16, 32, 64, 100, 1000, 10000, 20_000, 210_000]
         df = DataFrame()
         mat = Any[a + 100 * b + (iseven(b) ? 0.5 : 0) for a in 1:2, b in 1:i]
         tab = Tables.table(mat, header=Symbol.("x", 1:i))
diff --git a/test/join.jl b/test/join.jl
@@ -1510,79 +1510,77 @@ end
     @test m1[!, :a] == m2[!, :a]
 end
 
-if Sys.WORD_SIZE == 64
-    @testset "threaded correctness" begin
-        df1 = DataFrame(id=[1:10^6; 10^7+1:10^7+2])
-        df1.left_row = axes(df1, 1)
-        df2 = DataFrame(id=[1:10^6; 10^8+1:10^8+4])
-        df2.right_row = axes(df2, 1)
-
-        @test innerjoin(df1, df2, on=:id) ≅
-              DataFrame(id=1:10^6, left_row=1:10^6, right_row=1:10^6)
-        @test leftjoin(df1, df2, on=:id) ≅
-              DataFrame(id=[1:10^6; 10^7+1:10^7+2], left_row=1:10^6+2,
-                        right_row=[1:10^6; missing; missing])
-        @test rightjoin(df1, df2, on=:id) ≅
-              DataFrame(id=[1:10^6; 10^8+1:10^8+4],
-                        left_row=[1:10^6; fill(missing, 4)],
-                        right_row=1:10^6+4)
-        @test outerjoin(df1, df2, on=:id) ≅
-              DataFrame(id=[1:10^6; 10^7+1:10^7+2; 10^8+1:10^8+4],
-                      left_row=[1:10^6+2; fill(missing, 4)],
-                      right_row=[1:10^6; missing; missing; 10^6+1:10^6+4])
-        @test semijoin(df1, df2, on=:id) ≅
-              DataFrame(id=1:10^6, left_row=1:10^6)
-        @test antijoin(df1, df2, on=:id) ≅
-              DataFrame(id=10^7+1:10^7+2, left_row=10^6+1:10^6+2)
-
-        Random.seed!(1234)
-        for i in 1:4
-            df1 = df1[shuffle(axes(df1, 1)), :]
-            df2 = df2[shuffle(axes(df2, 1)), :]
-
-            @test sort!(innerjoin(df1, df2, on=:id)) ≅
-                  DataFrame(id=1:10^6, left_row=1:10^6, right_row=1:10^6)
-            @test sort!(leftjoin(df1, df2, on=:id)) ≅
-                  DataFrame(id=[1:10^6; 10^7+1:10^7+2], left_row=1:10^6+2,
-                          right_row=[1:10^6; missing; missing])
-            @test sort!(rightjoin(df1, df2, on=:id)) ≅
-                  DataFrame(id=[1:10^6; 10^8+1:10^8+4],
-                              left_row=[1:10^6; fill(missing, 4)],
-                              right_row=1:10^6+4)
-            @test sort!(outerjoin(df1, df2, on=:id)) ≅
-                  DataFrame(id=[1:10^6; 10^7+1:10^7+2; 10^8+1:10^8+4],
-                          left_row=[1:10^6+2; fill(missing, 4)],
-                          right_row=[1:10^6; missing; missing; 10^6+1:10^6+4])
-            @test sort!(semijoin(df1, df2, on=:id)) ≅
-                  DataFrame(id=1:10^6, left_row=1:10^6)
-            @test sort!(antijoin(df1, df2, on=:id)) ≅
-                  DataFrame(id=10^7+1:10^7+2, left_row=10^6+1:10^6+2)
-        end
+@testset "threaded correctness" begin
+    df1 = DataFrame(id=[1:10^5; 10^7+1:10^7+2])
+    df1.left_row = axes(df1, 1)
+    df2 = DataFrame(id=[1:10^5; 10^8+1:10^8+4])
+    df2.right_row = axes(df2, 1)
+
+    @test innerjoin(df1, df2, on=:id) ≅
+          DataFrame(id=1:10^5, left_row=1:10^5, right_row=1:10^5)
+    @test leftjoin(df1, df2, on=:id) ≅
+          DataFrame(id=[1:10^5; 10^7+1:10^7+2], left_row=1:10^5+2,
+                    right_row=[1:10^5; missing; missing])
+    @test rightjoin(df1, df2, on=:id) ≅
+          DataFrame(id=[1:10^5; 10^8+1:10^8+4],
+                    left_row=[1:10^5; fill(missing, 4)],
+                    right_row=1:10^5+4)
+    @test outerjoin(df1, df2, on=:id) ≅
+          DataFrame(id=[1:10^5; 10^7+1:10^7+2; 10^8+1:10^8+4],
+                  left_row=[1:10^5+2; fill(missing, 4)],
+                  right_row=[1:10^5; missing; missing; 10^5+1:10^5+4])
+    @test semijoin(df1, df2, on=:id) ≅
+          DataFrame(id=1:10^5, left_row=1:10^5)
+    @test antijoin(df1, df2, on=:id) ≅
+          DataFrame(id=10^7+1:10^7+2, left_row=10^5+1:10^5+2)
 
-        # test correctness of column order
-        df1 = DataFrame(a=Int8(1), id2=-[1:10^6; 10^7+1:10^7+2], b=Int8(2),
-                        id1=[1:10^6; 10^7+1:10^7+2], c=Int8(3), d=Int8(4))
-        df2 = DataFrame(e=Int8(5), id1=[1:10^6; 10^8+1:10^8+4], f=Int8(6), g=Int8(7),
-                        id2=-[1:10^6; 10^8+1:10^8+4], h=Int8(8))
-
-        @test innerjoin(df1, df2, on=[:id1, :id2]) ≅
-              DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
-                      c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
-        @test leftjoin(df1, df2, on=[:id1, :id2])[1:10^6, :] ≅
-              DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
-                      c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
-        @test rightjoin(df1, df2, on=[:id1, :id2])[1:10^6, :] ≅
-              DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
-                      c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
-        @test outerjoin(df1, df2, on=[:id1, :id2])[1:10^6, :] ≅
-              DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
-                      c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
-        @test semijoin(df1, df2, on=[:id1, :id2]) ≅
-              DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6, c=Int8(3), d=Int8(4))
-        @test antijoin(df1, df2, on=[:id1, :id2]) ≅
-              DataFrame(a=Int8(1), id2=-(10^7+1:10^7+2), b=Int8(2), id1=(10^7+1:10^7+2),
-                      c=Int8(3), d=Int8(4))
+    Random.seed!(1234)
+    for i in 1:4
+        df1 = df1[shuffle(axes(df1, 1)), :]
+        df2 = df2[shuffle(axes(df2, 1)), :]
+
+        @test sort!(innerjoin(df1, df2, on=:id)) ≅
+              DataFrame(id=1:10^5, left_row=1:10^5, right_row=1:10^5)
+        @test sort!(leftjoin(df1, df2, on=:id)) ≅
+              DataFrame(id=[1:10^5; 10^7+1:10^7+2], left_row=1:10^5+2,
+                      right_row=[1:10^5; missing; missing])
+        @test sort!(rightjoin(df1, df2, on=:id)) ≅
+              DataFrame(id=[1:10^5; 10^8+1:10^8+4],
+                          left_row=[1:10^5; fill(missing, 4)],
+                          right_row=1:10^5+4)
+        @test sort!(outerjoin(df1, df2, on=:id)) ≅
+              DataFrame(id=[1:10^5; 10^7+1:10^7+2; 10^8+1:10^8+4],
+                      left_row=[1:10^5+2; fill(missing, 4)],
+                      right_row=[1:10^5; missing; missing; 10^5+1:10^5+4])
+        @test sort!(semijoin(df1, df2, on=:id)) ≅
+              DataFrame(id=1:10^5, left_row=1:10^5)
+        @test sort!(antijoin(df1, df2, on=:id)) ≅
+              DataFrame(id=10^7+1:10^7+2, left_row=10^5+1:10^5+2)
     end
+
+    # test correctness of column order
+    df1 = DataFrame(a=Int8(1), id2=-[1:10^5; 10^7+1:10^7+2], b=Int8(2),
+                    id1=[1:10^5; 10^7+1:10^7+2], c=Int8(3), d=Int8(4))
+    df2 = DataFrame(e=Int8(5), id1=[1:10^5; 10^8+1:10^8+4], f=Int8(6), g=Int8(7),
+                    id2=-[1:10^5; 10^8+1:10^8+4], h=Int8(8))
+
+    @test innerjoin(df1, df2, on=[:id1, :id2]) ≅
+          DataFrame(a=Int8(1), id2=-(1:10^5), b=Int8(2), id1=1:10^5,
+                  c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
+    @test leftjoin(df1, df2, on=[:id1, :id2])[1:10^5, :] ≅
+          DataFrame(a=Int8(1), id2=-(1:10^5), b=Int8(2), id1=1:10^5,
+                  c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
+    @test rightjoin(df1, df2, on=[:id1, :id2])[1:10^5, :] ≅
+          DataFrame(a=Int8(1), id2=-(1:10^5), b=Int8(2), id1=1:10^5,
+                  c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
+    @test outerjoin(df1, df2, on=[:id1, :id2])[1:10^5, :] ≅
+          DataFrame(a=Int8(1), id2=-(1:10^5), b=Int8(2), id1=1:10^5,
+                  c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
+    @test semijoin(df1, df2, on=[:id1, :id2]) ≅
+          DataFrame(a=Int8(1), id2=-(1:10^5), b=Int8(2), id1=1:10^5, c=Int8(3), d=Int8(4))
+    @test antijoin(df1, df2, on=[:id1, :id2]) ≅
+          DataFrame(a=Int8(1), id2=-(10^7+1:10^7+2), b=Int8(2), id1=(10^7+1:10^7+2),
+                  c=Int8(3), d=Int8(4))
 end
 
 @testset "matchmissing :notequal correctness" begin
diff --git a/test/multithreading.jl b/test/multithreading.jl
@@ -2,15 +2,6 @@ module TestMultithreading
 
 using Test, DataFrames
 
-
-@testset "pre-Julia 1.3 @spawn replacement" begin
-    t = @sync DataFrames.@spawn begin
-        sleep(1)
-        true
-    end
-    @test fetch(t) === true
-end
-
 @testset "split_indices" begin
     for len in 1:100, basesize in 1:10
         x = DataFrames.split_indices(len, basesize)
@@ -268,4 +259,4 @@ end
 
 end
 
-end # module
+end # module
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -9,9 +9,9 @@ anyerrors = false
 using DataFrames, Dates, Test, Random, InlineStrings
 
 if Threads.nthreads() < 2
-    @warn("Running with only one thread: correctness of parallel operations is not tested")
+    @warn("Running tests with only one thread: correctness of parallel operations is not checked")
 else
-    @show Threads.nthreads()
+    @info("Running tests with $(Threads.nthreads()) threads")
 end
 
 ambiguities_vec = Test.detect_ambiguities(DataFrames, recursive=true)