diff --git a/Project.toml b/Project.toml index 386490985d..cc56cb70b2 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "1.5.0" Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Future = "9fa8497b-333b-5362-9e8d-4d0656e87820" +InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" IteratorInterfaceExtensions = "82899510-4779-5014-852e-03e436cf321d" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -22,6 +23,7 @@ SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" TableTraits = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] @@ -35,6 +37,7 @@ Missings = "0.4.2, 1" PooledArrays = "1.4.2" PrettyTables = "2.1" Reexport = "0.1, 0.2, 1" +SentinelArrays = "1.2" ShiftedArrays = "1, 2" SnoopPrecompile = "1" SortingAlgorithms = "0.1, 0.2, 0.3, 1" diff --git a/docs/src/man/basics.md b/docs/src/man/basics.md index 300b503a90..20c1d2405e 100644 --- a/docs/src/man/basics.md +++ b/docs/src/man/basics.md @@ -16,7 +16,7 @@ or ```julia julia> ] # ']' should be pressed -(@v1.6) pkg> add DataFrames +(@v1.9) pkg> add DataFrames ``` If you want to make sure everything works as expected you can run the tests @@ -35,9 +35,9 @@ you have installed with the `status` command. 
```julia julia> ] -(@v1.6) pkg> status DataFrames - Status `C:\Users\TeAmp0is0N\.julia\environments\v1.6\Project.toml` - [a93c6f00] DataFrames v1.1.1 +(@v1.9) pkg> status DataFrames + Status `~\v1.9\Project.toml` + [a93c6f00] DataFrames v1.5.0 ``` Throughout the rest of the tutorial we will assume that you have installed the @@ -52,6 +52,40 @@ The most fundamental type provided by DataFrames.jl is `DataFrame`, where typically each row is interpreted as an observation and each column as a feature. +!!! note "Advanced installation configuration" + + **Advanced installation settings.** + DataFrames.jl puts in extra time and effort when the package is being built + (precompiled) to make sure it is more responsive when you are using it. + However, in some scenarios users might want to avoid this extra + precompilation effort to reduce the time needed to build the package and + later to load it. To disable precompilation of DataFrames.jl in your current + project you need to install the + [SnoopPrecompile.jl](https://timholy.github.io/SnoopCompile.jl/stable/snoop_pc/) + and [Preferences.jl](https://github.com/JuliaPackaging/Preferences.jl) + packages and then run the following code: + ``` + using SnoopPrecompile, Preferences + Preferences.set_preferences!(SnoopPrecompile, + "skip_precompile" => union(Preferences.load_preference(SnoopPrecompile, + "skip_precompile", + String[]), + ["DataFrames"]); + force=true) + ``` + If you later want to re-enable precompilation of DataFrames.jl you + can do it using the following commands: + ``` + using SnoopPrecompile, Preferences + Preferences.set_preferences!(SnoopPrecompile, + "skip_precompile" => + filter(!=("DataFrames"), + Preferences.load_preference(SnoopPrecompile, + "skip_precompile", + String[])); + force=true) + ``` + ## Constructors and Basic Utility Functions ### Constructors @@ -1785,7 +1819,7 @@ in them: julia> select(german, Not(["Age", "Saving accounts", "Checking account", "Credit amount", "Purpose"])) 1000×5 
DataFrame - Row │ id Sex Job Housing Duration + Row │ id Sex Job Housing Duration │ Int64 String7 Int64 String7 Int64 ──────┼────────────────────────────────────────── 1 │ 0 male 2 own 6 diff --git a/src/DataFrames.jl b/src/DataFrames.jl index a2a652154a..d32ffe99b9 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -11,6 +11,8 @@ using PrettyTables using Random using Tables: ByRow import SnoopPrecompile +import SentinelArrays +import InlineStrings import DataAPI, DataAPI.allcombinations, diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index fe6fb842f5..7b7779e813 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1892,6 +1892,14 @@ function Base.reduce(::typeof(vcat), return res end +# definition needed to avoid dispatch ambiguity; keyword arguments are +# forwarded to the reduce method invoked on the collected vector +Base.reduce(::typeof(vcat), + dfs::SentinelArrays.ChainedVector{T, A} where {T<:AbstractDataFrame, + A<:AbstractVector{T}}; + kwargs...) = + reduce(vcat, collect(AbstractDataFrame, dfs); kwargs...) + function _vcat(dfs::AbstractVector{AbstractDataFrame}; cols::Union{Symbol, AbstractVector{Symbol}, AbstractVector{<:AbstractString}}=:setequal) diff --git a/src/other/precompile.jl b/src/other/precompile.jl index 916a7880b5..c8584adc84 100644 --- a/src/other/precompile.jl +++ b/src/other/precompile.jl @@ -11,8 +11,8 @@ SnoopPrecompile.@precompile_all_calls begin combine(df, :c, [:c :f] .=> [sum, mean, std], :c => :d, [:a, :c] => cor) transform(df, :c, [:c :f] .=> [sum, mean, std], :c => :d, [:a, :c] => cor) groupby(df, :a) - groupby(df, :q) groupby(df, :p) + groupby(df, :q) gdf = groupby(df, :b) combine(gdf, :c, [:c :f] .=> [sum, mean, std], :c => :d, [:a, :c] => cor) transform(gdf, :c, [:c :f] .=> [sum, mean, std], :c => :d, [:a, :c] => cor) @@ -22,16 +22,52 @@ SnoopPrecompile.@precompile_all_calls begin outerjoin(df, df, on=:a, makeunique=true) outerjoin(df, df, on=:b, makeunique=true) outerjoin(df, df, on=:c, makeunique=true) - semijoin(df, df, 
on=:a) - semijoin(df, df, on=:b) - semijoin(df, df, on=:c) leftjoin!(df, DataFrame(a=[2, 5, 3, 1, 0]), on=:a) leftjoin!(df, DataFrame(b=["a", "b", "c", "d", "e"]), on=:b) leftjoin!(df, DataFrame(c=1:5), on=:c) reduce(vcat, [df, df]) show(IOBuffer(), df) subset(df, :q) - @view df[1:3, :] + subset!(copy(df), :q) + df[:, 1:2] + df[1:2, :] + df[1:2, 1:2] @view df[:, 1:2] + @view df[1:2, :] + @view df[1:2, 1:2] transform!(df, :c, [:c :f] .=> [sum, mean, std], :c => :d, [:a, :c] => cor) + deleteat!(df, 1) + append!(df, copy(df)) + push!(df, copy(df[1, :])) + eachrow(df) + eachcol(df) + empty(df) + empty!(copy(df)) + filter(:q => identity, df) + filter!(:q => identity, df) + first(df) + last(df) + hcat(df, df, makeunique=true) + issorted(df) + pop!(df) + popfirst!(df) + repeat(df, 2) + reverse(df) + reverse!(df) + unique(df, :a) + unique!(df, :a) + wide = DataFrame(id=1:6, + a=repeat(1:3, inner=2), + b=repeat(1.0:2.0, inner=3), + c=repeat(1.0:1.0, inner=6), + d=repeat(1.0:3.0, inner=2)) + long = stack(wide) + unstack(long) + unstack(long, :variable, :value, combine=sum) + flatten(DataFrame(a=[[1, 2], [3, 4]], b=[1, 2]), :a) + dropmissing(DataFrame(a=[1, 2, 3, missing], b=["a", missing, "c", "d"])) + df = DataFrame(rand(20, 2), :auto) + df.id = repeat(1:2, 10) + combine(df, AsTable(r"x") .=> [ByRow(sum), ByRow(mean)]) + combine(groupby(df, :id), AsTable(r"x") .=> [ByRow(sum), ByRow(mean)]) end diff --git a/test/dataframe.jl b/test/dataframe.jl index d338aa2fed..febfe4cd35 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -1892,6 +1892,13 @@ end DataFrame(c=[missing, missing])) end +@testset "vcat ChainedVector ambiguity" begin + # reduce(vcat, ...) over a ChainedVector of data frames must yield their concatenation + chunks = [[DataFrame(a=1)], [DataFrame(a=2)]] + chained = DataFrames.SentinelArrays.ChainedVector(chunks) + @test reduce(vcat, chained) == DataFrame(a=1:2) +end + @testset "names for Type, predicate + standard tests of cols" begin df_long = DataFrame(a1=1:3, a2=[1, missing, 3], b1=1.0:3.0, b2=[1.0, missing, 3.0],