Skip to content

Commit fdd9193

Browse files
authored
Improve performance of dropmissing (#3256)
1 parent 4e44ee8 commit fdd9193

File tree

3 files changed

+84
-2
lines changed

3 files changed

+84
-2
lines changed

docs/src/lib/functions.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ This is a list of operations that currently make use of multi-threading:
2525
* a transformation produces one row per group and the passed transformation
2626
is a custom function (i.e. not for standard reductions, which use
2727
optimized single-threaded methods).
28+
- `dropmissing` when the provided data frame has more than 1 column and `view=false`
29+
(subsetting of individual columns is spawned in separate tasks).
2830

2931
In general at least Julia 1.4 is required to ensure that multi-threading is used
3032
and the Julia process must be started with more than one thread. Some operations

src/abstractdataframe/abstractdataframe.jl

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -970,15 +970,34 @@ julia> dropmissing(df, [:x, :y])
970970
@inline function dropmissing(df::AbstractDataFrame,
971971
cols::Union{ColumnIndex, MultiColumnIndex}=:;
972972
view::Bool=false, disallowmissing::Bool=!view)
973+
# Identify Bool mask of which rows have no missings
973974
rowidxs = completecases(df, cols)
974975
if view
975976
if disallowmissing
976977
throw(ArgumentError("disallowmissing=true is incompatible with view=true"))
977978
end
978979
return Base.view(df, rowidxs, :)
979980
else
980-
newdf = df[rowidxs, :]
981-
disallowmissing && disallowmissing!(newdf, cols)
981+
# Faster when there are many columns (indexing with integers rather than via a Bool mask)
982+
# or when there are many missings (as we skip a lot of iterations)
983+
selected_rows = _findall(rowidxs)
984+
new_columns = Vector{AbstractVector}(undef, ncol(df))
985+
986+
# What column indices should disallowmissing be applied to
987+
cols_inds = BitSet(index(df)[cols])
988+
989+
use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000
990+
@sync for (i, col) in enumerate(eachcol(df))
991+
@spawn_or_run use_threads if disallowmissing && (i in cols_inds)
992+
new_columns[i] = Missings.disallowmissing(Base.view(col, selected_rows))
993+
else
994+
new_columns[i] = col[selected_rows]
995+
end
996+
end
997+
998+
newdf = DataFrame(new_columns, copy(index(df)), copycols=false)
999+
1000+
_copy_all_note_metadata!(newdf, df)
9821001
return newdf
9831002
end
9841003
end

test/data.jl

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,17 @@ end
168168
@test df1b == df1
169169
end
170170

171+
# Zero column case
172+
@test isempty(dropmissing(DataFrame())) && dropmissing(DataFrame()) isa DataFrame
173+
@test isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame
174+
df = DataFrame(a=1:3, b=4:6)
175+
dfv = @view df[:, 2:1]
176+
# TODO: re-enable after https://github.com/JuliaData/DataFrames.jl/issues/3272 is resolved
177+
# @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame
178+
@test_throws ArgumentError dropmissing!(dfv)
179+
@test_throws ArgumentError dropmissing(df1, [])
180+
@test_throws ArgumentError dropmissing!(df1, [])
181+
171182
df = DataFrame(a=[1, missing, 3])
172183
sdf = view(df, :, :)
173184
@test dropmissing(sdf) == DataFrame(a=[1, 3])
@@ -186,6 +197,16 @@ end
186197
@test eltype(df2.a) == Union{Int, Missing}
187198
@test df.a == df2.a == [1, 3]
188199

200+
# view=true
201+
df = DataFrame(a=[1, missing, 3])
202+
@test dropmissing(df, view=false) == DataFrame(a=[1, 3])
203+
@test dropmissing(df, view=true) == view(df, [1, 3], :)
204+
@test typeof(dropmissing(df, view=true)) <: SubDataFrame
205+
@test eltype(dropmissing(df, view=true, disallowmissing=false).a) == Union{Int, Missing}
206+
@test_throws ArgumentError dropmissing(df, view=true, disallowmissing=true)
207+
@test eltype(dropmissing(df, view=false, disallowmissing=false).a) == Union{Int, Missing}
208+
@test eltype(dropmissing(df, view=false, disallowmissing=true).a) == Int
209+
189210
a = [1, 2]
190211
df = DataFrame(a=a, copycols=false)
191212
@test dropmissing!(df) === df
@@ -200,6 +221,46 @@ end
200221
df = DataFrame(b=b)
201222
@test eltype(dropmissing(df).b) == Int
202223
@test eltype(dropmissing!(df).b) == Int
224+
225+
# disallowmissing argument
226+
a = Union{Int, Missing}[3, 4]
227+
b = Union{Int, Missing}[1, 2]
228+
df = DataFrame(;a,b)
229+
@test eltype(dropmissing(df, disallowmissing=false).a) == Union{Int, Missing}
230+
@test eltype(dropmissing!(copy(df), disallowmissing=false).a) == Union{Int, Missing}
231+
@test eltype(dropmissing(df, disallowmissing=true).a) == Int
232+
@test eltype(dropmissing!(copy(df), disallowmissing=true).a) == Int
233+
@test eltype(dropmissing(df, :a, disallowmissing=true).a) == Int
234+
@test eltype(dropmissing!(copy(df), :a, disallowmissing=true).a) == Int
235+
@test eltype(dropmissing(df, :b, disallowmissing=true).a) == Union{Int, Missing}
236+
@test eltype(dropmissing!(copy(df), :b, disallowmissing=true).a) == Union{Int, Missing}
237+
238+
# CategoricalArrays
239+
c = categorical([1, 2, 1, missing])
240+
df = DataFrame(c=c)
241+
@test dropmissing(df) == DataFrame(c=categorical([1, 2, 1]))
242+
@test eltype(dropmissing(df).c) == CategoricalValue{Int, UInt32}
243+
@test eltype(dropmissing!(df).c) == CategoricalValue{Int, UInt32}
244+
245+
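# Multithreaded execution test (threaded path requires ncol > 1 and at least 100_000 retained rows)
# NOTE(review): the mask below keeps only 77_000 of 110_000 rows (88_000 for two columns), so the threaded path is likely not reached - confirm and raise N_rows if so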
246+
N_rows, N_cols = 110_000, 3
247+
df = DataFrame([rand(N_rows) for i in 1:N_cols], :auto) |> allowmissing
248+
# Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing
249+
for i in 1:ncol(df)
250+
missing_mask = (eachindex(df[!, i]) .% 10) .== i
251+
df[missing_mask, i] .= missing
252+
end
253+
254+
notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)]
255+
@test dropmissing(df) ≅ df[notmissing_rows, :]
256+
257+
cols = [:x1, :x2]
258+
notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > length(cols)]
259+
returned = dropmissing(df, cols)
260+
@test returned ≅ df[notmissing_rows, :]
261+
@test eltype(returned[:, cols[1]]) == nonmissingtype(eltype(df[:, cols[1]]))
262+
@test eltype(returned[:, cols[2]]) == nonmissingtype(eltype(df[:, cols[2]]))
263+
@test eltype(returned[:, ncol(df)]) == eltype(df[:, ncol(df)])
203264
end
204265

205266
@testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin

0 commit comments

Comments
 (0)