adds extra for sep and remove for unite (#113)

drizk1 · kdpsingh · web-flow · commit ad1e8b53338f · 2024-09-03T10:44:37.000-07:00
* adds extra for sep and remove for unite * switch from `warn` ex to `drop` ex in docstring * add :cat_other, :cat_replace_missing, :cat_recode to donotvec list * fixes `n` slice_min/max bug (#110) * fixes `n` slice_min/max bug * adds `@head` * Clean up documentation in prep for release, bump version to v0.16.2. * Fix doctest. --------- Co-authored-by: Karandeep Singh <karandeep@gmail.com> * Cleaned up docstrings. * Clean up NEWS.md --------- Co-authored-by: Karandeep Singh <karandeep@gmail.com>
diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,9 @@
 # TidierData.jl updates
 
-## v0.16.2 - 2024-08-05
+## v0.16.2 - 2024-09-03
 - Bugfix: `@slice_min` and `@slice_max` respect the `n` argument
 - Adds `@head`
+- Adds `extra` argument for `@separate()` and `remove` argument for `@unite()`
 
 ## v0.16.1 - 2024-06-09
 - Adds support for tuples and vectors as arguments to select multiple columns. Prefixing tuples/vectors with a `-` or `!` will exclude the selected columns.
diff --git a/src/TidierData.jl b/src/TidierData.jl
@@ -28,7 +28,7 @@ const code = Ref{Bool}(false) # output DataFrames.jl code?
 const log = Ref{Bool}(false) # output tidylog output? (not yet implemented)
 
 # The global do-not-vectorize "list"
-const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr])
+const not_vectorized = Ref{Vector{Symbol}}([:getindex, :rand, :esc, :Ref, :Set, :Cols, :collect, :(:), :∘, :lag, :lead, :ntile, :repeat, :across, :desc, :mean, :std, :var, :median, :mad, :first, :last, :minimum, :maximum, :sum, :length, :skipmissing, :quantile, :passmissing, :cumsum, :cumprod, :accumulate, :is_float, :is_integer, :is_string, :cat_rev, :cat_relevel, :cat_infreq, :cat_lump, :cat_reorder, :cat_collapse, :cat_lump_min, :cat_lump_prop, :categorical, :as_categorical, :is_categorical, :unique, :iqr, :cat_other, :cat_replace_missing, :cat_recode])
 
 # The global do-not-escape "list"
 # `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped
diff --git a/src/docstrings.jl b/src/docstrings.jl
@@ -2305,15 +2305,16 @@ missing
 
 const docstring_separate = 
 """
-   @separate(df, From, Into, Separator)
+   @separate(df, from, into, sep, extra = "merge")
 
 Separate a string column into multiple new columns based on a specified delimiter
 
 # Arguments
 - `df`: A DataFrame
-- `From`: Column that will be split
-- `Into`: New column names, supports [] or ()
-- `Separator`: the string or chacater on which to split
+- `from`: Column that will be split
+- `into`: New column names, supports [] or ()
+- `sep`: the string or character on which to split
+- `extra`: one of "merge" (the default), "warn", or "drop". If `into` has fewer names than the number of pieces produced by the split, `extra` determines whether the additional pieces are merged into the final column ("merge") or dropped ("drop"). "warn" drops them as well and generates a warning message.
 
 # Examples
 ```jldoctest
@@ -2338,26 +2339,57 @@ julia> @chain df begin
    1 │ 1          1          missing    
    2 │ 2          2          missing    
    3 │ 3          3          3
+
+julia> @separate(df, a, (b, c), "-")
+3×2 DataFrame
+ Row │ b          c      
+     │ SubStrin…  String 
+─────┼───────────────────
+   1 │ 1          1
+   2 │ 2          2
+   3 │ 3          3-3
+
+julia> @chain df begin
+         @separate(a, (b, c), "-", extra = "drop")
+       end
+3×2 DataFrame
+ Row │ b          c         
+     │ SubStrin…  SubStrin… 
+─────┼──────────────────────
+   1 │ 1          1
+   2 │ 2          2
+   3 │ 3          3
+
 ```
 """
 
 const docstring_unite = 
 """
-      @unite(df, new_cols, from_cols, sep)
+      @unite(df, new_col, from_cols, sep, remove = true)
 
 Unite multiple columns into one new column using a specified delimiter
 
 # Arguments
 - `df`: A DataFrame
 - `new_col`: New column that will receive the combination
 - `from_cols`: Column names that it will combine, supports [] or ()
-- `sep`: the string or character that will seprate the values in the new column
+- `sep`: the string or character that will separate the values in the new column
+- `remove`: defaults to `true`, removes input columns from data frame
 
 # Examples
 ```jldoctest
 julia> df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]);
 
 julia> @unite(df, new_col, (b, c, d), "-")
+3×1 DataFrame
+ Row │ new_col 
+     │ String  
+─────┼─────────
+   1 │ 1-1
+   2 │ 2-2
+   3 │ 3-3-3
+   
+julia> @unite(df, new_col, (b, c, d), "-", remove = false)
 3×4 DataFrame
  Row │ b       c       d        new_col 
      │ String  String  String?  String  
@@ -3112,14 +3144,14 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a)
 
 const docstring_separate_rows =
 """
-    separate_rows(df, columns..., delimiter)
+    separate_rows(df, columns..., sep)
 
 Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.
 
 # Arguments
 - `df`: A DataFrame
 - `columns`: A column or multiple columns to be split. Can be a mix of integers and column names.
-- `delimiter`: The string or character or regular expression used to split the column values.
+- `sep`: The string or character or regular expression used to split the column values.
 
 # Examples
 ```jldoctest
@@ -3135,7 +3167,7 @@ julia> df = DataFrame(a = 1:3,
    2 │     2  aa;bb;cc  2;3;4   8;9;10
    3 │     3  dd;ee     5;6     11;12
 
-julia> @separate_rows(df, 2, 4, ";" )
+julia> @separate_rows(df, 2, 4, ";")
 6×4 DataFrame
  Row │ a      b          c       d         
      │ Int64  SubStrin…  String  SubStrin… 
@@ -3147,7 +3179,7 @@ julia> @separate_rows(df, 2, 4, ";" )
    5 │     3  dd         5;6     11
    6 │     3  ee         5;6     12
 
-julia> @separate_rows(df, b:d, ";" )
+julia> @separate_rows(df, b:d, ";")
 6×4 DataFrame
  Row │ a      b          c          d         
      │ Int64  SubStrin…  SubStrin…  SubStrin… 
@@ -3163,7 +3195,7 @@ julia> @separate_rows(df, b:d, ";" )
 
 const docstring_unnest_wider =
 """
-    @unnest_wider(df, columns, names_sep=)
+    @unnest_wider(df, columns, names_sep)
 
 Unnest specified columns of arrays or dictionaries into wider format dataframe with individual columns.
 
@@ -3236,7 +3268,7 @@ julia> @unnest_longer(df, 2)
    3 │     2      3  [7, 8]
    4 │     2      4  [7, 8]
 
-julia> @unnest_longer(df, b:c, indices_include=true)
+julia> @unnest_longer(df, b:c, indices_include = true)
 4×5 DataFrame
  Row │ a      b      c      b_id   c_id  
      │ Int64  Int64  Int64  Int64  Int64 
diff --git a/src/separate_unite.jl b/src/separate_unite.jl
@@ -9,82 +9,126 @@ end
 """
 $docstring_separate
 """
-macro separate(df, from, into, sep)
-    from_quoted = QuoteNode(from)
-    
-    interpolated_into, _, _ = parse_interpolation(into)
-    
-    if @capture(interpolated_into, (args__,)) || @capture(interpolated_into, [args__])
-        args = QuoteNode.(args)
-        into_expr = :[$(args...)]
-    else
-        into_expr = quote
-            if typeof($interpolated_into) <: Vector{String}
-                Symbol.($interpolated_into)
-            else
-                $interpolated_into
-            end
+macro separate(df, from, into, sep, args...)
+  extra = "merge"
+  for arg in args
+    if isa(arg, Expr) && arg.head == :(=)
+        if arg.args[1] == :extra
+            extra = arg.args[2]
         end
     end
-    
-    return quote
-        separate($(esc(df)), $(from_quoted), $(into_expr), $(esc(sep)))
-    end
+ end
+
+  from_quoted = QuoteNode(from)
+  
+  interpolated_into, _, _ = parse_interpolation(into)
+  
+  if @capture(interpolated_into, (args__,)) || @capture(interpolated_into, [args__])
+      args = QuoteNode.(args)
+      into_expr = :[$(args...)]
+  else
+      into_expr = quote
+          if typeof($interpolated_into) <: Vector{String}
+              Symbol.($interpolated_into)
+          else
+              $interpolated_into
+          end
+      end
+  end
+  
+  return quote
+      separate($(esc(df)), $(from_quoted), $(into_expr), $(esc(sep)); extra=$(esc(extra)))
+  end
 end
 
-function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String})
+function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String}; extra::String = "merge")
   new_df = df[:, :]
   new_cols = map(x -> split(x, sep), new_df[:, col])
   max_cols = maximum(length.(new_cols))
 
-  if length(into) < max_cols
-      error("Not enough names provided in `into` for all split columns.")
+  if length(into) < max_cols && extra == "warn"
+      @warn "Dropping extra split parts that don't fit into the provided `into` columns."
+      max_cols = length(into)
+  elseif length(into) < max_cols && extra == "drop"
+      max_cols = length(into)
+  elseif length(into) < max_cols && extra == "merge"
+      merge = true
+  elseif length(into) < max_cols
+      error("Not enough names provided in \"into\" for all split columns.")
+  else
+      merge = false
   end
 
-  for i in 1:max_cols
-      new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
+  for i in 1:length(into)
+      if i < length(into) || (extra == "warn" && i <= max_cols) || (extra == "drop" && i <= max_cols)
+          new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
+      elseif i == length(into) && merge
+        new_df[:, into[i]] = map(x -> length(x) >= i ? join(x[i:end], sep) : missing, new_cols)
+      else
+        for i in 1:max_cols
+          new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
+      end
+
+      end
   end
 
   new_df = select(new_df, Not(col))
 
   return new_df
 end
 
+
 """
 $docstring_unite
 """
-macro unite(df, new_col, from_cols, sep)
-    new_col_quoted = QuoteNode(new_col)
-    interpolated_from_cols, _, _ = parse_interpolation(from_cols)
-    interpolated_from_cols = parse_tidy(interpolated_from_cols)
+macro unite(df, new_col, from_cols, sep, args...)
+  remove=true
+  for arg in args
+    if isa(arg, Expr) && arg.head == :(=)
+        if arg.args[1] == :remove
+            remove = arg.args[2]
+        end
+    end
+ end
+  new_col_quoted = QuoteNode(new_col)
+  interpolated_from_cols, _, _ = parse_interpolation(from_cols)
+  interpolated_from_cols = parse_tidy(interpolated_from_cols)
 
-    if @capture(interpolated_from_cols, (first_col:last_col))
+  if @capture(interpolated_from_cols, (first_col:last_col))
       from_cols_expr = :($(first_col):$(last_col))
-    elseif @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__])
+  elseif @capture(interpolated_from_cols, (args__,)) || @capture(interpolated_from_cols, [args__])
       args = QuoteNode.(args)
       from_cols_expr = :[$(args...)]
-    else
+  else
       from_cols_expr = quote
           if typeof($interpolated_from_cols) <: Tuple
               collect(Symbol.($interpolated_from_cols))
           else
-            $interpolated_from_cols
+              $interpolated_from_cols
           end
       end
-    end
-    return quote
-        unite($(esc(df)), $new_col_quoted, [$(from_cols_expr)], $(esc(sep)))
-    end
+  end
+  
+  return quote
+      unite($(esc(df)), $new_col_quoted, [$(from_cols_expr)], $(esc(sep)); remove=$(esc(remove)))
+  end
 end
 
-function unite(df::DataFrame, new_col_name::Symbol, columns, sep::String="_")
+
+function unite(df::DataFrame, new_col_name::Symbol, columns, sep::String="_"; remove::Bool=true)
   new_df = df[:, :]
   cols_expr = columns isa Expr ? (columns,) : columns
   column_symbols = names(df, Cols(cols_expr...)) 
   new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, column_symbols])]
+  
+  if remove
+      new_df = select(new_df, Not(column_symbols))
+  end
+  
   return new_df
 end
 
+
 """
 $docstring_separate_rows
 """