Skip to content

Commit 70d1e23

Browse files
authored
Improve allcombinations docstring + minor cleanups after #3256 (#3276)
1 parent fdd9193 commit 70d1e23

File tree

4 files changed

+31
-12
lines changed

4 files changed

+31
-12
lines changed

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@
3232
its parent
3333
([#3273](https://github.com/JuliaData/DataFrames.jl/pull/3273))
3434

35+
## Performance improvements
36+
37+
* `dropmissing` creates new columns in a single pass if `disallowmissing=true`
38+
([#3256](https://github.com/JuliaData/DataFrames.jl/pull/3256))
39+
3540
# DataFrames.jl v1.4.4 Patch Release Notes
3641

3742
## Bug fixes

src/abstractdataframe/abstractdataframe.jl

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -985,11 +985,19 @@ julia> dropmissing(df, [:x, :y])
985985

986986
# What column indices should disallowmissing be applied to
987987
cols_inds = BitSet(index(df)[cols])
988-
988+
989989
use_threads = Threads.nthreads() > 1 && ncol(df) > 1 && length(selected_rows) >= 100_000
990990
@sync for (i, col) in enumerate(eachcol(df))
991-
@spawn_or_run use_threads if disallowmissing && (i in cols_inds)
992-
new_columns[i] = Missings.disallowmissing(Base.view(col, selected_rows))
991+
@spawn_or_run use_threads if disallowmissing && (i in cols_inds) &&
992+
(Missing <: eltype(col) && eltype(col) !== Any)
993+
# Perform this path only if column eltype allows missing values
994+
# except Any, as nonmissingtype(Any) == Any.
995+
# Under these conditions Missings.disallowmissing must allocate
996+
# a fresh column
997+
col_sel = Base.view(col, selected_rows)
998+
new_col = Missings.disallowmissing(col_sel)
999+
@assert new_col !== col_sel
1000+
new_columns[i] = new_col
9931001
else
9941002
new_columns[i] = col[selected_rows]
9951003
end
@@ -3421,4 +3429,3 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta
34213429
r = min(state + itr.n - 1, last_idx)
34223430
return view(itr.c, state:r, :), r + 1
34233431
end
3424-

src/dataframe/dataframe.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1472,6 +1472,7 @@ allcombinations(::Type{DataFrame}, pairs::Pair{<:AbstractString, <:Any}...) =
14721472
allcombinations(DataFrame; kwargs...)
14731473
14741474
Create a `DataFrame` from all combinations of values in passed arguments.
1475+
The first passed values vary fastest.
14751476
14761477
Arguments associating a column name with values to expand can be specified
14771478
either as `Pair`s passed as positional arguments, or as keyword arguments.

test/data.jl

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -173,12 +173,11 @@ end
173173
@test isempty(dropmissing!(DataFrame())) && dropmissing!(DataFrame()) isa DataFrame
174174
df = DataFrame(a=1:3, b=4:6)
175175
dfv = @view df[:, 2:1]
176-
# TODO: re-enable after https://github.com/JuliaData/DataFrames.jl/issues/3272 is resolved
177-
# @test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame
176+
@test isempty(dropmissing(dfv)) && dropmissing(dfv) isa DataFrame
178177
@test_throws ArgumentError dropmissing!(dfv)
179-
@test_throws ArgumentError dropmissing(df1, [])
180-
@test_throws ArgumentError dropmissing!(df1, [])
181-
178+
@test_throws ArgumentError dropmissing(df1, [])
179+
@test_throws ArgumentError dropmissing!(df1, [])
180+
182181
df = DataFrame(a=[1, missing, 3])
183182
sdf = view(df, :, :)
184183
@test dropmissing(sdf) == DataFrame(a=[1, 3])
@@ -248,19 +247,26 @@ end
248247
# Deterministic drop mask: IF remainder of index position divided by 10 == column index THEN missing
249248
for i in 1:ncol(df)
250249
missing_mask = (eachindex(df[!, i]) .% 10) .== i
251-
df[missing_mask, i] .= missing
250+
df[missing_mask, i] .= missing
252251
end
253-
252+
254253
notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > ncol(df)]
255254
@test dropmissing(df) ≅ df[notmissing_rows, :]
256-
255+
257256
cols = [:x1, :x2]
258257
notmissing_rows = [i for i in 1:N_rows if i % 10 == 0 || i % 10 > length(cols)]
259258
returned = dropmissing(df, cols)
260259
@test returned ≅ df[notmissing_rows, :]
261260
@test eltype(returned[:, cols[1]]) == nonmissingtype(eltype(df[:, cols[1]]))
262261
@test eltype(returned[:, cols[2]]) == nonmissingtype(eltype(df[:, cols[2]]))
263262
@test eltype(returned[:, ncol(df)]) == eltype(df[:, ncol(df)])
263+
264+
# check that views are not propagated into the columns of the result
265+
df = DataFrame(a=1:3, b=Any[11, missing, 13])
266+
df2 = dropmissing(df)
267+
@test df2 == DataFrame(a=[1, 3], b=[11, 13])
268+
@test df2.a isa Vector{Int}
269+
@test df2.b isa Vector{Any}
264270
end
265271

266272
@testset "deleteat! https://github.com/JuliaLang/julia/pull/41646 bug workaround" begin

0 commit comments

Comments
 (0)