Skip to content

Commit c0c8cd3

Browse files
authored
use multithreading in basic operations (#2647)
1 parent 639e3b1 commit c0c8cd3

File tree

9 files changed

+261
-33
lines changed

9 files changed

+261
-33
lines changed

NEWS.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,12 @@
4646
values in `on` columns. These aspects of input data frames might affect the
4747
order of rows produced in the output
4848
([#2612](https://github.com/JuliaData/DataFrames.jl/pull/2612),
49-
[#2622][https://github.com/JuliaData/DataFrames.jl/pull/2622])
49+
([#2622][https://github.com/JuliaData/DataFrames.jl/pull/2622])
50+
* `DataFrame` constructor, `copy`, `getindex`, `select`, `select!`, `transform`,
51+
`transform!`, and `combine` functions now use multiple threads in selected operations
52+
([#2647](https://github.com/JuliaData/DataFrames.jl/pull/2647)),
53+
([#2588](https://github.com/JuliaData/DataFrames.jl/pull/2588)),
54+
([#2574](https://github.com/JuliaData/DataFrames.jl/pull/2574))
5055

5156
# DataFrames v0.22 Release Notes
5257

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
using BenchmarkTools
2+
using DataFrames
3+
using PooledArrays
4+
using Random
5+
6+
@show Threads.nthreads()
7+
8+
Random.seed!(1234)
9+
ref_dfi = DataFrame(rand(1:10^4, 10^7, 4), :auto)
10+
ref_dfs = string.(ref_dfi)
11+
ref_dfp = mapcols(PooledArray, ref_dfs)
12+
13+
res = DataFrame(rows=Int[],cols=Int[], type=String[], op=String[], time=Float64[])
14+
15+
for x in (10, 10^6-1, 10^6, 10^7), y in 1:4
16+
dfi = ref_dfi[1:x, 1:y]
17+
dfs = ref_dfs[1:x, 1:y]
18+
dfp = ref_dfp[1:x, 1:y]
19+
20+
@show (x, y) # ping that the process is alive
21+
push!(res, (x, y, "integer", "copy", @belapsed DataFrame($dfi)))
22+
push!(res, (x, y, "string", "copy", @belapsed DataFrame($dfs)))
23+
push!(res, (x, y, "pooled", "copy", @belapsed DataFrame($dfp)))
24+
push!(res, (x, y, "integer", ":", @belapsed $dfi[:, :]))
25+
push!(res, (x, y, "string", ":", @belapsed $dfs[:, :]))
26+
push!(res, (x, y, "pooled", ":", @belapsed $dfp[:, :]))
27+
push!(res, (x, y, "integer", "1:end-5", @belapsed $dfi[1:end-5, :]))
28+
push!(res, (x, y, "string", "1:end-5", @belapsed $dfs[1:end-5, :]))
29+
push!(res, (x, y, "pooled", "1:end-5", @belapsed $dfp[1:end-5, :]))
30+
push!(res, (x, y, "integer", "1:5", @belapsed $dfi[1:5, :]))
31+
push!(res, (x, y, "string", "1:5", @belapsed $dfs[1:1:5, :]))
32+
push!(res, (x, y, "pooled", "1:5", @belapsed $dfp[1:1:5, :]))
33+
end
34+
35+
res.time *= 1_000
36+
37+
@show Threads.nthreads()
38+
@show unstack(res, [:cols, :type, :op], :rows, :time)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
julia -t 1 constructor_and_indexing_performance.jl
2+
julia -t 2 constructor_and_indexing_performance.jl
3+
julia -t 4 constructor_and_indexing_performance.jl
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
using CategoricalArrays
2+
using DataFrames
3+
using PooledArrays
4+
using Random
5+
6+
fullgc() = (GC.gc(true); GC.gc(true); GC.gc(true); GC.gc(true))
7+
8+
@assert length(ARGS) == 7
9+
@assert ARGS[3] in ["int", "pool", "cat", "str"]
10+
@assert ARGS[4] in ["uniq", "dup", "manydup"]
11+
@assert ARGS[5] in ["sort", "rand"]
12+
@assert ARGS[6] in ["1", "2"]
13+
@assert ARGS[7] in ["inner", "left", "right", "outer", "semi", "anti"]
14+
15+
@info ARGS
16+
17+
llen = parse(Int, ARGS[1])
18+
rlen = parse(Int, ARGS[2])
19+
@assert llen > 1000
20+
@assert rlen > 2000
21+
22+
pad = maximum(length.(string.((llen, rlen))))
23+
24+
if ARGS[3] == "int"
25+
if ARGS[4] == "uniq"
26+
col1 = [1:llen;]
27+
col2 = [1:rlen;]
28+
elseif ARGS[4] == "dup"
29+
col1 = repeat(1:llen ÷ 2, inner=2)
30+
col2 = repeat(1:rlen ÷ 2, inner=2)
31+
else
32+
@assert ARGS[4] == "manydup"
33+
col1 = repeat(1:llen ÷ 20, inner=20)
34+
col2 = repeat(1:rlen ÷ 20, inner=20)
35+
end
36+
elseif ARGS[3] == "pool"
37+
if ARGS[4] == "dup"
38+
col1 = PooledArray(repeat(string.(1:llen ÷ 2, pad=pad), inner=2))
39+
col2 = PooledArray(repeat(string.(1:rlen ÷ 2, pad=pad), inner=2))
40+
else
41+
@assert ARGS[4] == "manydup"
42+
col1 = PooledArray(repeat(string.(1:llen ÷ 20, pad=pad), inner=20))
43+
col2 = PooledArray(repeat(string.(1:rlen ÷ 20, pad=pad), inner=20))
44+
end
45+
elseif ARGS[3] == "cat"
46+
if ARGS[4] == "dup"
47+
col1 = categorical(repeat(string.(1:llen ÷ 2, pad=pad), inner=2))
48+
col2 = categorical(repeat(string.(1:rlen ÷ 2, pad=pad), inner=2))
49+
else
50+
@assert ARGS[4] == "manydup"
51+
col1 = categorical(repeat(string.(1:llen ÷ 20, pad=pad), inner=20))
52+
col2 = categorical(repeat(string.(1:rlen ÷ 20, pad=pad), inner=20))
53+
end
54+
else
55+
@assert ARGS[3] == "str"
56+
if ARGS[4] == "uniq"
57+
col1 = string.(1:llen, pad=pad)
58+
col2 = string.(1:rlen, pad=pad)
59+
elseif ARGS[4] == "dup"
60+
col1 = repeat(string.(1:llen ÷ 2, pad=pad), inner=2)
61+
col2 = repeat(string.(1:rlen ÷ 2, pad=pad), inner=2)
62+
else
63+
@assert ARGS[4] == "manydup"
64+
col1 = repeat(string.(1:llen ÷ 20, pad=pad), inner=20)
65+
col2 = repeat(string.(1:rlen ÷ 20, pad=pad), inner=20)
66+
end
67+
end
68+
69+
Random.seed!(1234)
70+
71+
if ARGS[5] == "rand"
72+
shuffle!(col1)
73+
shuffle!(col2)
74+
else
75+
@assert ARGS[5] == "sort"
76+
end
77+
78+
const joinfun = Dict("inner" => innerjoin, "left" => leftjoin,
79+
"right" => rightjoin, "outer" => outerjoin,
80+
"semi" => semijoin, "anti" => antijoin)[ARGS[7]]
81+
82+
if ARGS[6] == "1"
83+
df1 = DataFrame(id1 = col1)
84+
df2 = DataFrame(id1 = col2)
85+
joinfun(df1[1:1000, :], df2[1:2000, :], on=:id1)
86+
joinfun(df2[1:2000, :], df1[1:1000, :], on=:id1)
87+
fullgc()
88+
@time joinfun(df1, df2, on=:id1)
89+
fullgc()
90+
@time joinfun(df2, df1, on=:id1)
91+
else
92+
@assert ARGS[6] == "2"
93+
df1 = DataFrame(id1 = col1, id2 = col1)
94+
df2 = DataFrame(id1 = col2, id2 = col2)
95+
joinfun(df1[1:1000, :], df2[1:2000, :], on=[:id1, :id2])
96+
joinfun(df2[1:2000, :], df1[1:1000, :], on=[:id1, :id2])
97+
fullgc()
98+
@time joinfun(df1, df2, on=[:id1, :id2])
99+
fullgc()
100+
@time joinfun(df2, df1, on=[:id1, :id2])
101+
end

benchmarks/joins/run.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
julia runtests.jl 100000 50000000 inner
2+
julia runtests.jl 5000000 10000000 inner
3+
julia runtests.jl 100000 50000000 left
4+
julia runtests.jl 5000000 10000000 left
5+
julia runtests.jl 100000 50000000 right
6+
julia runtests.jl 5000000 10000000 right
7+
julia runtests.jl 100000 50000000 outer
8+
julia runtests.jl 5000000 10000000 outer
9+
julia runtests.jl 100000 50000000 semi
10+
julia runtests.jl 5000000 10000000 semi
11+
julia runtests.jl 100000 50000000 anti
12+
julia runtests.jl 5000000 10000000 anti
File renamed without changes.

src/dataframe/dataframe.jl

Lines changed: 83 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -190,21 +190,19 @@ struct DataFrame <: AbstractDataFrame
190190

191191
# we write into columns as we know that it is guaranteed
192192
# that it was freshly allocated in the outer constructor
193-
for (i, col) in enumerate(columns)
194-
# check for vectors first as they are most common
195-
if col isa AbstractRange
196-
columns[i] = collect(col)
197-
elseif col isa AbstractVector
198-
columns[i] = copycols ? copy(col) : col
199-
elseif col isa Union{AbstractArray{<:Any, 0}, Ref}
200-
x = col[]
201-
columns[i] = fill!(Tables.allocatecolumn(typeof(x), len), x)
193+
@static if VERSION >= v"1.4"
194+
if copycols && len >= 1_000_000 && length(columns) > 1 && Threads.nthreads() > 1
195+
@sync for i in eachindex(columns)
196+
Threads.@spawn columns[i] = _preprocess_column(columns[i], len, copycols)
197+
end
202198
else
203-
if col isa AbstractArray
204-
throw(ArgumentError("adding AbstractArray other than AbstractVector " *
205-
"as a column of a data frame is not allowed"))
199+
for i in eachindex(columns)
200+
columns[i] = _preprocess_column(columns[i], len, copycols)
206201
end
207-
columns[i] = fill!(Tables.allocatecolumn(typeof(col), len), col)
202+
end
203+
else
204+
for i in eachindex(columns)
205+
columns[i] = _preprocess_column(columns[i], len, copycols)
208206
end
209207
end
210208

@@ -216,6 +214,22 @@ struct DataFrame <: AbstractDataFrame
216214
end
217215
end
218216

217+
function _preprocess_column(col::Any, len::Integer, copycols::Bool)
218+
if col isa AbstractRange
219+
return collect(col)
220+
elseif col isa AbstractVector
221+
return copycols ? copy(col) : col
222+
elseif col isa Union{AbstractArray{<:Any, 0}, Ref}
223+
x = col[]
224+
return fill!(Tables.allocatecolumn(typeof(x), len), x)
225+
elseif col isa AbstractArray
226+
throw(ArgumentError("adding AbstractArray other than AbstractVector " *
227+
"as a column of a data frame is not allowed"))
228+
else
229+
return fill!(Tables.allocatecolumn(typeof(col), len), col)
230+
end
231+
end
232+
219233
DataFrame(df::DataFrame; copycols::Bool=true) = copy(df, copycols=copycols)
220234

221235
function DataFrame(pairs::Pair{Symbol, <:Any}...; makeunique::Bool=false,
@@ -502,34 +516,75 @@ end
502516
throw(BoundsError(df, (row_inds, col_inds)))
503517
end
504518
selected_columns = index(df)[col_inds]
505-
# Computing integer indices once for all columns is faster
506-
selected_rows = T === Bool ? findall(row_inds) : row_inds
507-
new_columns = AbstractVector[dv[selected_rows] for dv in _columns(df)[selected_columns]]
508-
return DataFrame(new_columns, Index(_names(df)[selected_columns]), copycols=false)
519+
520+
u = _names(df)[selected_columns]
521+
lookup = Dict{Symbol, Int}(zip(u, 1:length(u)))
522+
# use this constructor to avoid checking twice if column names are not
523+
# duplicate as index(df)[col_inds] already checks this
524+
idx = Index(lookup, u)
525+
526+
if length(selected_columns) == 1
527+
return DataFrame(AbstractVector[_columns(df)[selected_columns[1]][row_inds]],
528+
idx, copycols=false)
529+
else
530+
# Computing integer indices once for all columns is faster
531+
selected_rows = T === Bool ? findall(row_inds) : row_inds
532+
@static if VERSION >= v"1.4"
533+
if length(selected_rows) >= 1_000_000 && Threads.nthreads() > 1
534+
new_columns = Vector{AbstractVector}(undef, length(selected_columns))
535+
@sync for i in eachindex(new_columns)
536+
Threads.@spawn new_columns[i] = _columns(df)[selected_columns[i]][selected_rows]
537+
end
538+
return DataFrame(new_columns, idx, copycols=false)
539+
else
540+
return DataFrame(AbstractVector[_columns(df)[i][selected_rows] for i in selected_columns],
541+
idx, copycols=false)
542+
end
543+
else
544+
return DataFrame(AbstractVector[_columns(df)[i][selected_rows] for i in selected_columns],
545+
idx, copycols=false)
546+
end
547+
end
509548
end
510549

511550
@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T}, ::Colon) where T
512551
@boundscheck if !checkindex(Bool, axes(df, 1), row_inds)
513552
throw(BoundsError(df, (row_inds, :)))
514553
end
515-
# Computing integer indices once for all columns is faster
516-
selected_rows = T === Bool ? findall(row_inds) : row_inds
517-
new_columns = AbstractVector[dv[selected_rows] for dv in _columns(df)]
518-
return DataFrame(new_columns, copy(index(df)), copycols=false)
554+
idx = copy(index(df))
555+
556+
if ncol(df) == 1
557+
return DataFrame(AbstractVector[_columns(df)[1][row_inds]], idx, copycols=false)
558+
else
559+
# Computing integer indices once for all columns is faster
560+
selected_rows = T === Bool ? findall(row_inds) : row_inds
561+
@static if VERSION >= v"1.4"
562+
if length(selected_rows) >= 1_000_000 && Threads.nthreads() > 1
563+
new_columns = Vector{AbstractVector}(undef, ncol(df))
564+
@sync for i in eachindex(new_columns)
565+
Threads.@spawn new_columns[i] = _columns(df)[i][selected_rows]
566+
end
567+
return DataFrame(new_columns, idx, copycols=false)
568+
else
569+
return DataFrame(AbstractVector[dv[selected_rows] for dv in _columns(df)],
570+
idx, copycols=false)
571+
end
572+
else
573+
return DataFrame(AbstractVector[dv[selected_rows] for dv in _columns(df)],
574+
idx, copycols=false)
575+
end
576+
end
519577
end
520578

521-
@inline Base.getindex(df::DataFrame, row_inds::Not,
522-
col_inds::MultiColumnIndex) =
579+
@inline Base.getindex(df::DataFrame, row_inds::Not, col_inds::MultiColumnIndex) =
523580
df[axes(df, 1)[row_inds], col_inds]
524581

525582
# df[:, MultiColumnIndex] => DataFrame
526-
Base.getindex(df::DataFrame, row_ind::Colon,
527-
col_inds::MultiColumnIndex) =
583+
Base.getindex(df::DataFrame, row_ind::Colon, col_inds::MultiColumnIndex) =
528584
select(df, col_inds, copycols=true)
529585

530586
# df[!, MultiColumnIndex] => DataFrame
531-
Base.getindex(df::DataFrame, row_ind::typeof(!),
532-
col_inds::MultiColumnIndex) =
587+
Base.getindex(df::DataFrame, row_ind::typeof(!), col_inds::MultiColumnIndex) =
533588
select(df, col_inds, copycols=false)
534589

535590
##############################################################################
@@ -875,11 +930,7 @@ copies of column vectors in `df`.
875930
If `copycols=false`, return a new `DataFrame` sharing column vectors with `df`.
876931
"""
877932
function Base.copy(df::DataFrame; copycols::Bool=true)
878-
if copycols
879-
df[:, :]
880-
else
881-
DataFrame(_columns(df), _names(df), copycols=false)
882-
end
933+
return DataFrame(copy(_columns(df)), copy(index(df)), copycols=copycols)
883934
end
884935

885936
"""

test/constructors.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,4 +365,11 @@ end
365365
@test_throws ArgumentError DataFrame([Int, Float64], ["a", "b"], 2)
366366
end
367367

368+
@testset "threading correctness tests" begin
369+
for x in (10, 2*10^6), y in 1:4
370+
df = DataFrame(rand(x, y), :auto)
371+
@test df == copy(df)
372+
end
373+
end
374+
368375
end # module

test/indexing.jl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2009,4 +2009,15 @@ if VERSION >= v"1.5"
20092009
include("indexing_offset.jl")
20102010
end
20112011

2012+
@testset "threading correctness tests" begin
2013+
for x in (10, 2*10^6), y in 1:4
2014+
mat = rand(x, y)
2015+
df = DataFrame(mat, :auto)
2016+
for rowrange in [:, 1:nrow(df)-5, collect(1:nrow(df)-5), axes(df, 1) .< nrow(df)-5],
2017+
colrange in [:, axes(df, 2), collect(axes(df, 2)), 1:ncol(df) - 1]
2018+
@test DataFrame(mat[rowrange, colrange], :auto) == df[rowrange, colrange]
2019+
end
2020+
end
2021+
end
2022+
20122023
end # module

0 commit comments

Comments
 (0)