diff --git a/NEWS.md b/NEWS.md index f221043..c2ae1af 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # TidierData.jl updates +## v0.16.3 +- Bugfix: `@summary` no longer errors with non-numeric columns. Instead, it only reports non-numeric summary stats on non-numeric columns. Minor changes to summary column names to be lowercase and snake_case. + ## v0.16.2 - 2024-09-03 - Bugfix: `@slice_min` and `@slice_max` respect the `n` argument - Adds `@head` diff --git a/Project.toml b/Project.toml index d7d9e33..bd366a2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.16.2" +version = "0.16.3" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/src/docstrings.jl b/src/docstrings.jl index ae20886..f45a83f 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -1026,13 +1026,14 @@ julia> @chain df @pull(2) const docstring_left_join = """ - @left_join(df1, df2, [by]) + @left_join(df1, df2, [join_by]) Perform a left join on `df1` and `df2` with an optional `by`. # Arguments - `df1`: A DataFrame. - `df2`: A DataFrame. +- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`. - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. # Examples @@ -1040,7 +1041,23 @@ Perform a left join on `df1` and `df2` with an optional `by`. julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); - + +julia> @left_join(df1, df2, join_by(a == a)) +2×3 DataFrame + Row │ a b c + │ String Int64 Int64? 
+─────┼──────────────────────── + 1 │ a 1 3 + 2 │ b 2 missing + +julia> @left_join(df1, df2, join_by("a"=="a")) +2×3 DataFrame + Row │ a b c + │ String Int64 Int64? +─────┼──────────────────────── + 1 │ a 1 3 + 2 │ b 2 missing + julia> @left_join(df1, df2) 2×3 DataFrame Row │ a b c @@ -1085,13 +1102,14 @@ julia> @left_join(df1, df2, "a" = "a") const docstring_right_join = """ - @right_join(df1, df2, [by]) + @right_join(df1, df2, [join_by]) Perform a right join on `df1` and `df2` with an optional `by`. # Arguments - `df1`: A DataFrame. - `df2`: A DataFrame. +- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`. - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. # Examples @@ -1099,7 +1117,23 @@ Perform a right join on `df1` and `df2` with an optional `by`. julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); - + +julia> @right_join(df1, df2, join_by(a == a)) +2×3 DataFrame + Row │ a b c + │ String Int64? Int64 +─────┼──────────────────────── + 1 │ a 1 3 + 2 │ c missing 4 + +julia> @right_join(df1, df2, join_by("a"=="a")) +2×3 DataFrame + Row │ a b c + │ String Int64? Int64 +─────┼──────────────────────── + 1 │ a 1 3 + 2 │ c missing 4 + julia> @right_join(df1, df2) 2×3 DataFrame Row │ a b c @@ -1144,13 +1178,14 @@ julia> @right_join(df1, df2, "a" = "a") const docstring_inner_join = """ - @inner_join(df1, df2, [by]) + @inner_join(df1, df2, [join_by]) Perform a inner join on `df1` and `df2` with an optional `by`. # Arguments - `df1`: A DataFrame. - `df2`: A DataFrame. +- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`.
It can accept multiple columns, such as `join_by(a==b, c==d)`. - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. # Examples @@ -1158,7 +1193,21 @@ Perform a inner join on `df1` and `df2` with an optional `by`. julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); - + +julia> @inner_join(df1, df2, join_by(a == a)) +1×3 DataFrame + Row │ a b c + │ String Int64 Int64 +─────┼────────────────────── + 1 │ a 1 3 + +julia> @inner_join(df1, df2, join_by("a"=="a")) +1×3 DataFrame + Row │ a b c + │ String Int64 Int64 +─────┼────────────────────── + 1 │ a 1 3 + julia> @inner_join(df1, df2) 1×3 DataFrame Row │ a b c @@ -1198,13 +1247,15 @@ julia> @inner_join(df1, df2, "a" = "a") const docstring_full_join = """ - @full_join(df1, df2, [by]) + @full_join(df1, df2, [join_by]) Perform a full join on `df1` and `df2` with an optional `by`. # Arguments - `df1`: A DataFrame. - `df2`: A DataFrame. +- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`. + - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. # Examples @@ -1212,7 +1263,25 @@ Perform a full join on `df1` and `df2` with an optional `by`. julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); - + +julia> @full_join(df1, df2, join_by(a == a)) +3×3 DataFrame + Row │ a b c + │ String Int64? Int64? +─────┼────────────────────────── + 1 │ a 1 3 + 2 │ b 2 missing + 3 │ c missing 4 + +julia> @full_join(df1, df2, join_by("a"=="a")) +3×3 DataFrame + Row │ a b c + │ String Int64? Int64? 
+─────┼────────────────────────── + 1 │ a 1 3 + 2 │ b 2 missing + 3 │ c missing 4 + julia> @full_join(df1, df2) 3×3 DataFrame Row │ a b c @@ -1262,13 +1331,15 @@ julia> @full_join(df1, df2, "a" = "a") const docstring_anti_join = """ - @anti_join(df1, df2, [by]) + @anti_join(df1, df2, [join_by]) Perform an anti-join on `df1` and `df2` with an optional `by`. # Arguments - `df1`: A DataFrame. - `df2`: A DataFrame. +- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`. + - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. # Examples @@ -1276,7 +1347,21 @@ Perform an anti-join on `df1` and `df2` with an optional `by`. julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); - + +julia> @anti_join(df1, df2, join_by(a == a)) +1×2 DataFrame + Row │ a b + │ String Int64 +─────┼─────────────── + 1 │ b 2 + +julia> @anti_join(df1, df2, join_by("a"=="a")) +1×2 DataFrame + Row │ a b + │ String Int64 +─────┼─────────────── + 1 │ b 2 + julia> @anti_join(df1, df2) 1×2 DataFrame Row │ a b @@ -1316,13 +1401,15 @@ julia> @anti_join(df1, df2, "a" = "a") const docstring_semi_join = """ - @semi_join(df1, df2, [by]) + @semi_join(df1, df2, [join_by]) Perform an semi-join on `df1` and `df2` with an optional `by`. # Arguments - `df1`: A DataFrame. - `df2`: A DataFrame. +- `join_by`: A function or expression used to specify the join condition. It should be provided in the form of `join_by(column1 == column2)`. It can accept multiple columns, such as `join_by(a==b, c==d)`. + - `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. 
If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. # Examples @@ -1331,6 +1418,20 @@ julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); +julia> @semi_join(df1, df2, join_by(a == a)) +1×2 DataFrame + Row │ a b + │ String Int64 +─────┼─────────────── + 1 │ a 1 + +julia> @semi_join(df1, df2, join_by("a"=="a")) +1×2 DataFrame + Row │ a b + │ String Int64 +─────┼─────────────── + 1 │ a 1 + julia> @semi_join(df1, df2) 1×2 DataFrame Row │ a b @@ -2415,7 +2516,8 @@ For numerical columns, returns a dataframe with the Q1,Q3, min, max, mean, media julia> df = DataFrame(a = [1, 2, 3, 4, 5], b = [missing, 7, 8, 9, 10], c = [11, missing, 13, 14, missing], - d = [16, 17, 18, 19, 20]); + d = [16.1, 17.2, 18.3, 19.4, 20.5], + e = ["a", "a", "a", "a", "a"]); julia> @summary(df); diff --git a/src/parsing.jl b/src/parsing.jl index b85c251..7434081 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -216,7 +216,7 @@ function parse_join_by(tidy_expr::Union{Expr,Symbol,String}) tidy_expr, found_n, found_row_number = parse_interpolation(tidy_expr) src = Union{Expr,QuoteNode}[] # type can be either a QuoteNode or a expression containing a selection helper function - + if @capture(tidy_expr, expr_Symbol) push!(src, QuoteNode(expr)) elseif @capture(tidy_expr, expr_String) @@ -229,6 +229,18 @@ function parse_join_by(tidy_expr::Union{Expr,Symbol,String}) lhs = QuoteNode(Symbol(lhs)) rhs = QuoteNode(Symbol(rhs)) push!(src, :($lhs => $rhs)) + elseif tidy_expr isa Expr && tidy_expr.head == :call && tidy_expr.args[1] == :join_by + for arg in tidy_expr.args[2:end] + if arg isa Expr && arg.head == :call && arg.args[1] == Symbol("==") + lhs_arg = arg.args[2] + rhs_arg = arg.args[3] + push!(src, :($(QuoteNode(lhs_arg)) => $(QuoteNode(rhs_arg)))) + elseif isa(arg, Symbol) + push!(src, :($(QuoteNode(arg)) => $(QuoteNode(arg)))) + else + push!(src, arg) + end + end else @capture(tidy_expr, (args__,)) for 
arg in args diff --git a/src/separate_unite.jl b/src/separate_unite.jl index 52bb2ce..c17cffc 100644 --- a/src/separate_unite.jl +++ b/src/separate_unite.jl @@ -211,8 +211,8 @@ function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimite for row in eachrow(temp_df) value = row[column] # Handle missing values and non-string types - if ismissing(value) || typeof(value) != String - push!(expanded_data[column], [value]) + if ismissing(value) || !(value isa AbstractString) + push!(expanded_data[column], [value]) else push!(expanded_data[column], split(value, delimiter)) end diff --git a/src/summary.jl b/src/summary.jl index 68e0f29..893e84e 100644 --- a/src/summary.jl +++ b/src/summary.jl @@ -3,18 +3,37 @@ function summary_stats(df::DataFrame) summary_data = [] for column in colnames col = df[:, column] - col_nonmissing = collect(skipmissing(col)) - push!(summary_data, ( - Column = column, - Min = minimum(col_nonmissing), - Q1 = quantile(col_nonmissing, 0.25), - Median = median(col_nonmissing), - Mean = mean(col_nonmissing), - Q3 = quantile(col_nonmissing, 0.75), - Max = maximum(col_nonmissing), - Count = length(col_nonmissing), - Missing_Count = count(ismissing, col) - )) + if eltype(col) <: Union{Number, Missing} + col_nonmissing = collect(skipmissing(col)) + push!(summary_data, ( + column = column, + min = minimum(col_nonmissing), + q1 = quantile(col_nonmissing, 0.25), + median = median(col_nonmissing), + mean = mean(col_nonmissing), + q3 = quantile(col_nonmissing, 0.75), + max = maximum(col_nonmissing), + non_missing_values = length(col_nonmissing), + missing_values = count(ismissing, col), + total_values = length(col), + unique_values = length(unique(col_nonmissing)) + )) + else + col_nonmissing = collect(skipmissing(col)) + push!(summary_data, ( + column = column, + min = nothing, + q1 = nothing, + median = nothing, + mean = nothing, + q3 = nothing, + max = nothing, + non_missing_values = length(col_nonmissing), + missing_values = 
count(ismissing, col), + total_values = length(col), + unique_values = length(unique(col_nonmissing)) + )) + end end return DataFrame(summary_data) end