Skip to content

Commit 33c947e

Browse files
committed
Implement new dupcol keyword that indicates what to do with duplicate columns in joins and DataFrame constructors
1 parent e341cc7 commit 33c947e

File tree

12 files changed

+502
-214
lines changed

12 files changed

+502
-214
lines changed

src/abstractdataframe/abstractdataframe.jl

Lines changed: 74 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,9 @@ Compat.hasproperty(df::AbstractDataFrame, s::AbstractString) = haskey(index(df),
117117

118118
"""
119119
rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol};
120-
makeunique::Bool=false)
120+
makeunique::Bool=false, dupcol::Symbol=:error)
121121
rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString};
122-
makeunique::Bool=false)
122+
makeunique::Bool=false, dupcol::Symbol=:error)
123123
rename!(df::AbstractDataFrame, (from => to)::Pair...)
124124
rename!(df::AbstractDataFrame, d::AbstractDict)
125125
rename!(df::AbstractDataFrame, d::AbstractVector{<:Pair})
@@ -179,9 +179,9 @@ julia> rename!(df, [:a, :b, :c])
179179
1 │ 1 2 3
180180
181181
julia> rename!(df, [:a, :b, :a])
182-
ERROR: ArgumentError: Duplicate variable names: :a. Pass makeunique=true to make them unique using a suffix automatically.
182+
ERROR: ArgumentError: Duplicate variable names: :a. Pass dupcol=:makeunique to make them unique using a suffix automatically.
183183
184-
julia> rename!(df, [:a, :b, :a], makeunique=true)
184+
julia> rename!(df, [:a, :b, :a], dupcol=:makeunique)
185185
1×3 DataFrame
186186
Row │ a b a_1
187187
│ Int64 Int64 Int64
@@ -197,16 +197,16 @@ julia> rename!(uppercase, df)
197197
```
198198
"""
199199
function rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol};
200-
makeunique::Bool=false)
201-
rename!(index(df), vals, makeunique=makeunique)
200+
makeunique::Bool=false, dupcol::Symbol=:error)
201+
rename!(index(df), vals, makeunique=makeunique, dupcol=dupcol)
202202
# renaming columns of SubDataFrame has to clean non-note metadata in its parent
203203
_drop_all_nonnote_metadata!(parent(df))
204204
return df
205205
end
206206

207207
function rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString};
208-
makeunique::Bool=false)
209-
rename!(index(df), Symbol.(vals), makeunique=makeunique)
208+
makeunique::Bool=false, dupcol::Symbol=:error)
209+
rename!(index(df), Symbol.(vals), makeunique=makeunique, dupcol=dupcol)
210210
# renaming columns of SubDataFrame has to clean non-note metadata in its parent
211211
_drop_all_nonnote_metadata!(parent(df))
212212
return df
@@ -353,9 +353,9 @@ julia> rename(uppercase, df)
353353
```
354354
"""
355355
rename(df::AbstractDataFrame, vals::AbstractVector{Symbol};
356-
makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique)
356+
makeunique::Bool=false, dupcol::Symbol=:error) = rename!(copy(df), vals, makeunique=makeunique, dupcol=dupcol)
357357
rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString};
358-
makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique)
358+
makeunique::Bool=false, dupcol::Symbol=:error) = rename!(copy(df), vals, makeunique=makeunique, dupcol=dupcol)
359359
rename(df::AbstractDataFrame, args...) = rename!(copy(df), args...)
360360
rename(f::Function, df::AbstractDataFrame) = rename!(f, copy(df))
361361

@@ -1536,13 +1536,20 @@ end
15361536

15371537
"""
15381538
hcat(df::AbstractDataFrame...;
1539-
makeunique::Bool=false, copycols::Bool=true)
1539+
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
15401540
15411541
Horizontally concatenate data frames.
15421542
15431543
If `makeunique=false` (the default) column names of passed objects must be unique.
15441544
If `makeunique=true` then duplicate column names will be suffixed
15451545
with `_i` (`i` starting at 1 for the first duplicate).
1546+
The `makeunique` keyword argument is deprecated in favor of `dupcol`.
1547+
1548+
If `dupcol=:error` (the default) then column names of passed objects must be unique.
1549+
If `dupcol=:makeunique` then duplicate column names will be suffixed
1550+
with `_i` (`i` starting at 1 for the first duplicate).
1551+
If `dupcol=:update` then duplicate column names will be combined, with the left-hand
1552+
column overwritten by non-missing values from the right-hand column(s).
15461553
15471554
If `copycols=true` (the default) then the `DataFrame` returned by `hcat` will
15481555
contain copied columns from the source data frames.
@@ -1575,7 +1582,7 @@ julia> df2 = DataFrame(A=4:6, B=4:6)
15751582
2 │ 5 5
15761583
3 │ 6 6
15771584
1578-
julia> df3 = hcat(df1, df2, makeunique=true)
1585+
julia> df3 = hcat(df1, df2, dupcol=:makeunique)
15791586
3×4 DataFrame
15801587
Row │ A B A_1 B_1
15811588
│ Int64 Int64 Int64 Int64
@@ -1587,32 +1594,32 @@ julia> df3 = hcat(df1, df2, makeunique=true)
15871594
julia> df3.A === df1.A
15881595
false
15891596
1590-
julia> df3 = hcat(df1, df2, makeunique=true, copycols=false);
1597+
julia> df3 = hcat(df1, df2, dupcol=:makeunique, copycols=false);
15911598
15921599
julia> df3.A === df1.A
15931600
true
15941601
```
15951602
"""
1596-
function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true)
1603+
function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
15971604
df = DataFrame(df, copycols=copycols)
15981605
_drop_all_nonnote_metadata!(df)
15991606
return df
16001607
end
16011608

16021609
# TODO: after deprecation remove AbstractVector methods
1603-
Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) =
1604-
hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, copycols=copycols)
1605-
Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true) =
1606-
hcat!(x, df, makeunique=makeunique, copycols=copycols)
1610+
Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
1611+
hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, dupcol=dupcol, copycols=copycols)
1612+
Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
1613+
hcat!(x, df, makeunique=makeunique, dupcol=dupcol, copycols=copycols)
16071614
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame;
1608-
makeunique::Bool=false, copycols::Bool=true) =
1615+
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
16091616
hcat!(DataFrame(df1, copycols=copycols), df2,
1610-
makeunique=makeunique, copycols=copycols)
1617+
makeunique=makeunique, dupcol=dupcol, copycols=copycols)
16111618
Base.hcat(df::AbstractDataFrame, x::Union{AbstractVector, AbstractDataFrame},
16121619
y::Union{AbstractVector, AbstractDataFrame}...;
1613-
makeunique::Bool=false, copycols::Bool=true) =
1614-
hcat!(hcat(df, x, makeunique=makeunique, copycols=copycols), y...,
1615-
makeunique=makeunique, copycols=copycols)
1620+
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
1621+
hcat!(hcat(df, x, makeunique=makeunique, dupcol=dupcol, copycols=copycols), y...,
1622+
makeunique=makeunique, dupcol=dupcol, copycols=copycols)
16161623

16171624
"""
16181625
vcat(dfs::AbstractDataFrame...;
@@ -2870,6 +2877,10 @@ const INSERTCOLS_ARGUMENTS =
28702877
- `makeunique` : defines what to do if `name` already exists in `df`;
28712878
if it is `false` an error will be thrown; if it is `true` a new unique name will
28722879
be generated by adding a suffix
2880+
- `dupcol` : defines what to do if `name` already exists in `df`;
2881+
if it is `:error` an error will be thrown; if it is `:makeunique` a new unique name will
2882+
be generated by adding a suffix; if it is `:update` then the existing column will be
2883+
updated with the non-missing values of the inserted column
28732884
- `copycols` : whether vectors passed as columns should be copied
28742885
28752886
If `val` is an `AbstractRange` then the result of `collect(val)` is inserted.
@@ -2891,7 +2902,7 @@ const INSERTCOLS_ARGUMENTS =
28912902

28922903
"""
28932904
insertcols(df::AbstractDataFrame[, col], (name=>val)::Pair...;
2894-
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
2905+
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
28952906
28962907
Insert a column into a copy of `df` data frame using the [`insertcols!`](@ref)
28972908
function and return the newly created data frame.
@@ -2922,7 +2933,7 @@ julia> insertcols(df, 1, :b => 'a':'c')
29222933
2 │ b 2
29232934
3 │ c 3
29242935
2925-
julia> insertcols(df, :c => 2:4, :c => 3:5, makeunique=true)
2936+
julia> insertcols(df, :c => 2:4, :c => 3:5, dupcol=:makeunique)
29262937
3×3 DataFrame
29272938
Row │ a c c_1
29282939
│ Int64 Int64 Int64
@@ -2942,13 +2953,13 @@ julia> insertcols(df, :a, :d => 7:9, after=true)
29422953
```
29432954
"""
29442955
insertcols(df::AbstractDataFrame, args...;
2945-
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
2956+
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
29462957
insertcols!(copy(df), args...;
2947-
after=after, makeunique=makeunique, copycols=copycols)
2958+
after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols)
29482959

29492960
"""
29502961
insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...;
2951-
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
2962+
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
29522963
29532964
Insert a column into a data frame in place. Return the updated data frame.
29542965
@@ -2979,7 +2990,7 @@ julia> insertcols!(df, 1, :b => 'a':'c')
29792990
2 │ b 2
29802991
3 │ c 3
29812992
2982-
julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true)
2993+
julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, dupcol=:makeunique)
29832994
3×4 DataFrame
29842995
Row │ b c c_1 a
29852996
│ Char Int64 Int64 Int64
@@ -2999,7 +3010,10 @@ julia> insertcols!(df, :b, :d => 7:9, after=true)
29993010
```
30003011
"""
30013012
function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...;
3002-
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
3013+
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
3014+
3015+
dupcol = _dupcol(dupcol, makeunique)
3016+
30033017
if !is_column_insertion_allowed(df)
30043018
throw(ArgumentError("insertcols! is only supported for DataFrame, or for " *
30053019
"SubDataFrame created with `:` as column selector"))
@@ -3025,15 +3039,15 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
30253039
"$(ncol(df)) columns at index $col_ind"))
30263040
end
30273041

3028-
if !makeunique
3042+
if dupcol == :error
30293043
if !allunique(first.(name_cols))
30303044
throw(ArgumentError("Names of columns to be inserted into a data frame " *
3031-
"must be unique when `makeunique=true`"))
3045+
"must be unique when `dupcol=:error`"))
30323046
end
30333047
for (n, _) in name_cols
30343048
if hasproperty(df, n)
30353049
throw(ArgumentError("Column $n is already present in the data frame " *
3036-
"which is not allowed when `makeunique=true`"))
3050+
"which is not allowed when `dupcol=:error`"))
30373051
end
30383052
end
30393053
end
@@ -3103,19 +3117,28 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
31033117
dfp[!, name] = item_new
31043118
else
31053119
if hasproperty(dfp, name)
3106-
@assert makeunique
3107-
k = 1
3108-
while true
3109-
nn = Symbol("$(name)_$k")
3110-
if !hasproperty(dfp, nn)
3111-
name = nn
3112-
break
3120+
if dupcol == :makeunique
3121+
k = 1
3122+
while true
3123+
nn = Symbol("$(name)_$k")
3124+
if !hasproperty(dfp, nn)
3125+
name = nn
3126+
break
3127+
end
3128+
k += 1
31133129
end
3114-
k += 1
3130+
insert!(index(dfp), col_ind, name)
3131+
insert!(_columns(dfp), col_ind, item_new)
3132+
else
3133+
@assert dupcol == :update
3134+
# Just update without adding to index
3135+
dfp[!, name] = _update_missing.(dfp[!, name], item_new)
3136+
col_ind -= 1
31153137
end
3138+
else
3139+
insert!(index(dfp), col_ind, name)
3140+
insert!(_columns(dfp), col_ind, item_new)
31163141
end
3117-
insert!(index(dfp), col_ind, name)
3118-
insert!(_columns(dfp), col_ind, item_new)
31193142
end
31203143
col_ind += 1
31213144
end
@@ -3134,22 +3157,22 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
31343157
end
31353158

31363159
insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...;
3137-
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
3160+
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
31383161
insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)...,
3139-
after=after, makeunique=makeunique, copycols=copycols)
3162+
after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols)
31403163

31413164
insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...;
3142-
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
3165+
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
31433166
insertcols!(df, ncol(df)+1, name_cols..., after=after,
3144-
makeunique=makeunique, copycols=copycols)
3167+
makeunique=makeunique, dupcol=dupcol, copycols=copycols)
31453168

31463169
insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...;
3147-
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
3170+
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
31483171
insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)...,
3149-
after=after, makeunique=makeunique, copycols=copycols)
3172+
after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols)
31503173

31513174
function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false,
3152-
makeunique::Bool=false, copycols::Bool=true)
3175+
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
31533176
if col isa SymbolOrString
31543177
col_ind = Int(columnindex(df, col))
31553178
if col_ind == 0
@@ -3173,7 +3196,7 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false,
31733196
end
31743197

31753198
function insertcols!(df::AbstractDataFrame; after::Bool=false,
3176-
makeunique::Bool=false, copycols::Bool=true)
3199+
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
31773200
_drop_all_nonnote_metadata!(parent(df))
31783201
return df
31793202
end

src/abstractdataframe/reshape.jl

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -823,7 +823,7 @@ julia> permutedims(df2, 1, "different_name")
823823
"""
824824
function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,
825825
dest_namescol::Union{Symbol, AbstractString};
826-
makeunique::Bool=false, strict::Bool=true)
826+
makeunique::Bool=false, dupcol::Symbol=:error, strict::Bool=true)
827827

828828
if src_namescol isa Integer
829829
1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
@@ -854,26 +854,26 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,
854854

855855
if ncol(df_notsrc) == 0
856856
df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], new_col_names,
857-
makeunique=makeunique, copycols=false)
857+
makeunique=makeunique, dupcol=dupcol, copycols=false)
858858
else
859859
m = permutedims(Matrix(df_notsrc))
860-
df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique)
860+
df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique, dupcol=dupcol)
861861
end
862-
out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false)
862+
out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, dupcol=dupcol, copycols=false)
863863
_copy_table_note_metadata!(out_df, df)
864864
return out_df
865865
end
866866

867867
function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex;
868-
makeunique::Bool=false, strict::Bool=true)
868+
makeunique::Bool=false, dupcol::Symbol=:error, strict::Bool=true)
869869
if src_namescol isa Integer
870870
1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
871871
dest_namescol = _names(df)[src_namescol]
872872
else
873873
dest_namescol = src_namescol
874874
end
875875
return permutedims(df, src_namescol, dest_namescol;
876-
makeunique=makeunique, strict=strict)
876+
makeunique=makeunique, dupcol=dupcol, strict=strict)
877877
end
878878

879879
function Base.permutedims(df::AbstractDataFrame)
@@ -883,8 +883,8 @@ function Base.permutedims(df::AbstractDataFrame)
883883
end
884884

885885
function Base.permutedims(df::AbstractDataFrame, cnames::AbstractVector;
886-
makeunique::Bool=false)
887-
out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique)
886+
makeunique::Bool=false, dupcol::Symbol=:error)
887+
out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique, dupcol=dupcol)
888888
_copy_table_note_metadata!(out_df, df)
889889
return out_df
890890
end

0 commit comments

Comments
 (0)