Skip to content

Commit ad1e8b5

Browse files
drizk1 and kdpsingh authored
adds extra for sep and remove for unite (#113)
* adds extra for sep and remove for unite * switch from `warn` ex to `drop` ex in docstring * add :cat_other, :cat_replace_missing, :cat_recode to donotvec list * fixes `n` slice_min/max bug (#110) * fixes `n` slice_min/max bug * adds `@head` * Clean up documentation in prep for release, bump version to v0.16.2. * Fix doctest. --------- Co-authored-by: Karandeep Singh <[email protected]> * Cleaned up docstrings. * Clean up NEWS.md --------- Co-authored-by: Karandeep Singh <[email protected]>
1 parent 3431859 commit ad1e8b5

File tree

4 files changed

+128
-51
lines changed

4 files changed

+128
-51
lines changed

NEWS.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# TidierData.jl updates
22

3-
## v0.16.2 - 2024-08-05
3+
## v0.16.2 - 2024-09-03
44
- Bugfix: `@slice_min` and `@slice_max` respect the `n` argument
55
- Adds `@head`
6+
- Adds `extra` argument for `@separate()` and `remove` argument for `@unite()`
67

78
## v0.16.1 - 2024-06-09
89
- Adds support for tuples and vectors as arguments to select multiple columns. Prefixing tuples/vectors with a `-` or `!` will exclude the selected columns.

src/TidierData.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ const code = Ref{Bool}(false) # output DataFrames.jl code?
2828
const log = Ref{Bool}(false) # output tidylog output? (not yet implemented)
2929

3030
# The global do-not-vectorize "list"
31-
const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr])
31+
const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr, :cat_other, :cat_replace_missing, :cat_recode])
3232

3333
# The global do-not-escape "list"
3434
# `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped

src/docstrings.jl

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2305,15 +2305,16 @@ missing
23052305

23062306
const docstring_separate =
23072307
"""
2308-
@separate(df, From, Into, Separator)
2308+
@separate(df, from, into, sep, extra = "merge")
23092309
23102310
Separate a string column into multiple new columns based on a specified delimiter
23112311
23122312
# Arguments
23132313
- `df`: A DataFrame
2314-
- `From`: Column that will be split
2315-
- `Into`: New column names, supports [] or ()
2316-
- `Separator`: the string or chacater on which to split
2314+
- `from`: Column that will be split
2315+
- `into`: New column names, supports [] or ()
2316+
- `sep`: the string or character on which to split
2317+
- `extra`: one of "merge" (the default), "warn", or "drop". When a value splits into more pieces than there are names in `into`, `extra` determines whether the additional pieces are merged into the final column ("merge") or dropped ("drop"). "warn" also drops them but generates a warning message.
23172318
23182319
# Examples
23192320
```jldoctest
@@ -2338,26 +2339,57 @@ julia> @chain df begin
23382339
1 │ 1 1 missing
23392340
2 │ 2 2 missing
23402341
3 │ 3 3 3
2342+
2343+
julia> @separate(df, a, (b, c), "-")
2344+
3×2 DataFrame
2345+
Row │ b c
2346+
│ SubStrin… String
2347+
─────┼───────────────────
2348+
1 │ 1 1
2349+
2 │ 2 2
2350+
3 │ 3 3-3
2351+
2352+
julia> @chain df begin
2353+
@separate(a, (b, c), "-", extra = "drop")
2354+
end
2355+
3×2 DataFrame
2356+
Row │ b c
2357+
│ SubStrin… SubStrin…
2358+
─────┼──────────────────────
2359+
1 │ 1 1
2360+
2 │ 2 2
2361+
3 │ 3 3
2362+
23412363
```
23422364
"""
23432365

23442366
const docstring_unite =
23452367
"""
2346-
@unite(df, new_cols, from_cols, sep)
2368+
@unite(df, new_col, from_cols, sep, remove = true)
23472369
23482370
Unite multiple columns into one new column using a specified delimiter
23492371
23502372
# Arguments
23512373
- `df`: A DataFrame
23522374
- `new_col`: New column that will receive the combination
23532375
- `from_cols`: Column names that it will combine, supports [] or ()
2354-
- `sep`: the string or character that will seprate the values in the new column
2376+
- `sep`: the string or character that will separate the values in the new column
2377+
- `remove`: defaults to `true`, removes input columns from data frame
23552378
23562379
# Examples
23572380
```jldoctest
23582381
julia> df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]);
23592382
23602383
julia> @unite(df, new_col, (b, c, d), "-")
2384+
3×1 DataFrame
2385+
Row │ new_col
2386+
│ String
2387+
─────┼─────────
2388+
1 │ 1-1
2389+
2 │ 2-2
2390+
3 │ 3-3-3
2391+
2392+
julia> @unite(df, new_col, (b, c, d), "-", remove = false)
23612393
3×4 DataFrame
23622394
Row │ b c d new_col
23632395
│ String String String? String
@@ -3112,14 +3144,14 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a)
31123144

31133145
const docstring_separate_rows =
31143146
"""
3115-
separate_rows(df, columns..., delimiter)
3147+
separate_rows(df, columns..., sep)
31163148
31173149
Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.
31183150
31193151
# Arguments
31203152
- `df`: A DataFrame
31213153
- `columns`: A column or multiple columns to be split. Can be a mix of integers and column names.
3122-
- `delimiter`: The string or character or regular expression used to split the column values.
3154+
- `sep`: The string or character or regular expression used to split the column values.
31233155
31243156
# Examples
31253157
```jldoctest
@@ -3135,7 +3167,7 @@ julia> df = DataFrame(a = 1:3,
31353167
2 │ 2 aa;bb;cc 2;3;4 8;9;10
31363168
3 │ 3 dd;ee 5;6 11;12
31373169
3138-
julia> @separate_rows(df, 2, 4, ";" )
3170+
julia> @separate_rows(df, 2, 4, ";")
31393171
6×4 DataFrame
31403172
Row │ a b c d
31413173
│ Int64 SubStrin… String SubStrin…
@@ -3147,7 +3179,7 @@ julia> @separate_rows(df, 2, 4, ";" )
31473179
5 │ 3 dd 5;6 11
31483180
6 │ 3 ee 5;6 12
31493181
3150-
julia> @separate_rows(df, b:d, ";" )
3182+
julia> @separate_rows(df, b:d, ";")
31513183
6×4 DataFrame
31523184
Row │ a b c d
31533185
│ Int64 SubStrin… SubStrin… SubStrin…
@@ -3163,7 +3195,7 @@ julia> @separate_rows(df, b:d, ";" )
31633195

31643196
const docstring_unnest_wider =
31653197
"""
3166-
@unnest_wider(df, columns, names_sep=)
3198+
@unnest_wider(df, columns, names_sep)
31673199
31683200
Unnest specified columns of arrays or dictionaries into wider format dataframe with individual columns.
31693201
@@ -3236,7 +3268,7 @@ julia> @unnest_longer(df, 2)
32363268
3 │ 2 3 [7, 8]
32373269
4 │ 2 4 [7, 8]
32383270
3239-
julia> @unnest_longer(df, b:c, indices_include=true)
3271+
julia> @unnest_longer(df, b:c, indices_include = true)
32403272
4×5 DataFrame
32413273
Row │ a b c b_id c_id
32423274
│ Int64 Int64 Int64 Int64 Int64

src/separate_unite.jl

Lines changed: 81 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -9,82 +9,126 @@ end
99
"""
1010
$docstring_separate
1111
"""
12-
macro separate(df, from, into, sep)
13-
from_quoted = QuoteNode(from)
14-
15-
interpolated_into, _, _ = parse_interpolation(into)
16-
17-
if @capture(interpolated_into, (args__,)) || @capture(interpolated_into, [args__])
18-
args = QuoteNode.(args)
19-
into_expr = :[$(args...)]
20-
else
21-
into_expr = quote
22-
if typeof($interpolated_into) <: Vector{String}
23-
Symbol.($interpolated_into)
24-
else
25-
$interpolated_into
26-
end
12+
macro separate(df, from, into, sep, args...)
13+
extra = "merge"
14+
for arg in args
15+
if isa(arg, Expr) && arg.head == :(=)
16+
if arg.args[1] == :extra
17+
extra = arg.args[2]
2718
end
2819
end
29-
30-
return quote
31-
separate($(esc(df)), $(from_quoted), $(into_expr), $(esc(sep)))
32-
end
20+
end
21+
22+
from_quoted = QuoteNode(from)
23+
24+
interpolated_into, _, _ = parse_interpolation(into)
25+
26+
if @capture(interpolated_into, (args__,)) || @capture(interpolated_into, [args__])
27+
args = QuoteNode.(args)
28+
into_expr = :[$(args...)]
29+
else
30+
into_expr = quote
31+
if typeof($interpolated_into) <: Vector{String}
32+
Symbol.($interpolated_into)
33+
else
34+
$interpolated_into
35+
end
36+
end
37+
end
38+
39+
return quote
40+
separate($(esc(df)), $(from_quoted), $(into_expr), $(esc(sep)); extra=$(esc(extra)))
41+
end
3342
end
3443

35-
function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String})
44+
function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String}; extra::String = "merge")
3645
new_df = df[:, :]
3746
new_cols = map(x -> split(x, sep), new_df[:, col])
3847
max_cols = maximum(length.(new_cols))
3948

40-
if length(into) < max_cols
41-
error("Not enough names provided in `into` for all split columns.")
49+
if length(into) < max_cols && extra == "warn"
50+
@warn "Dropping extra split parts that don't fit into the provided `into` columns."
51+
max_cols = length(into)
52+
elseif length(into) < max_cols && extra == "drop"
53+
max_cols = length(into)
54+
elseif length(into) < max_cols && extra == "merge"
55+
merge = true
56+
elseif length(into) < max_cols
57+
error("Not enough names provided in \"into\" for all split columns.")
58+
else
59+
merge = false
4260
end
4361

44-
for i in 1:max_cols
45-
new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
62+
for i in 1:length(into)
63+
if i < length(into) || (extra == "warn" && i <= max_cols) || (extra == "drop" && i <= max_cols)
64+
new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
65+
elseif i == length(into) && merge
66+
new_df[:, into[i]] = map(x -> length(x) >= i ? join(x[i:end], sep) : missing, new_cols)
67+
else
68+
for i in 1:max_cols
69+
new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
70+
end
71+
72+
end
4673
end
4774

4875
new_df = select(new_df, Not(col))
4976

5077
return new_df
5178
end
5279

80+
5381
"""
5482
$docstring_unite
5583
"""
56-
macro unite(df, new_col, from_cols, sep)
57-
new_col_quoted = QuoteNode(new_col)
58-
interpolated_from_cols, _, _ = parse_interpolation(from_cols)
59-
interpolated_from_cols = parse_tidy(interpolated_from_cols)
84+
macro unite(df, new_col, from_cols, sep, args...)
85+
remove=true
86+
for arg in args
87+
if isa(arg, Expr) && arg.head == :(=)
88+
if arg.args[1] == :remove
89+
remove = arg.args[2]
90+
end
91+
end
92+
end
93+
new_col_quoted = QuoteNode(new_col)
94+
interpolated_from_cols, _, _ = parse_interpolation(from_cols)
95+
interpolated_from_cols = parse_tidy(interpolated_from_cols)
6096

61-
if @capture(interpolated_from_cols, (first_col:last_col))
97+
if @capture(interpolated_from_cols, (first_col:last_col))
6298
from_cols_expr = :($(first_col):$(last_col))
63-
elseif @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__])
99+
elseif @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__])
64100
args = QuoteNode.(args)
65101
from_cols_expr = :[$(args...)]
66-
else
102+
else
67103
from_cols_expr = quote
68104
if typeof($interpolated_from_cols) <: Tuple
69105
collect(Symbol.($interpolated_from_cols))
70106
else
71-
$interpolated_from_cols
107+
$interpolated_from_cols
72108
end
73109
end
74-
end
75-
return quote
76-
unite($(esc(df)), $new_col_quoted, [$(from_cols_expr)], $(esc(sep)))
77-
end
110+
end
111+
112+
return quote
113+
unite($(esc(df)), $new_col_quoted, [$(from_cols_expr)], $(esc(sep)); remove=$(esc(remove)))
114+
end
78115
end
79116

80-
function unite(df::DataFrame, new_col_name::Symbol, columns, sep::String="_")
117+
118+
function unite(df::DataFrame, new_col_name::Symbol, columns, sep::String="_"; remove::Bool=true)
81119
new_df = df[:, :]
82120
cols_expr = columns isa Expr ? (columns,) : columns
83121
column_symbols = names(df, Cols(cols_expr...))
84122
new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, column_symbols])]
123+
124+
if remove
125+
new_df = select(new_df, Not(column_symbols))
126+
end
127+
85128
return new_df
86129
end
87130

131+
88132
"""
89133
$docstring_separate_rows
90134
"""

0 commit comments

Comments
 (0)