Skip to content

Commit 8dcccb4

Browse files
authored
Bk/add leftjoin! (#2843)
1 parent 9d0914f commit 8dcccb4

File tree

6 files changed

+601
-127
lines changed

6 files changed

+601
-127
lines changed

NEWS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@
5454
(with `false` default) that specifies if columns should be inserted after
5555
or before `col`.
5656
([#2829](https://github.com/JuliaData/DataFrames.jl/pull/2829))
57+
* `leftjoin!` performing a left join of two data frame objects by updating the
58+
left data frame with the joined columns from the right data frame.
59+
([#2843](https://github.com/JuliaData/DataFrames.jl/pull/2843))
5760

5861
## Bug fixes
5962

docs/src/lib/functions.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ antijoin
109109
crossjoin
110110
innerjoin
111111
leftjoin
112+
leftjoin!
112113
outerjoin
113114
rightjoin
114115
semijoin

src/DataFrames.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ export AbstractDataFrame,
5555
innerjoin,
5656
insertcols!,
5757
leftjoin,
58+
leftjoin!,
5859
mapcols,
5960
mapcols!,
6061
ncol,
@@ -134,6 +135,7 @@ include("abstractdataframe/reshape.jl")
134135

135136
include("join/composer.jl")
136137
include("join/core.jl")
138+
include("join/inplace.jl")
137139

138140
include("groupeddataframe/splitapplycombine.jl")
139141
include("groupeddataframe/callprocessing.jl")

src/join/composer.jl

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -92,16 +92,6 @@ struct DataFrameJoiner
9292
end
9393
end
9494

95-
# helper map between the row indices in original and joined table
96-
struct RowIndexMap
97-
"row indices in the original table"
98-
orig::Vector{Int}
99-
"row indices in the resulting joined table"
100-
join::Vector{Int}
101-
end
102-
103-
Base.length(x::RowIndexMap) = length(x.orig)
104-
10595
_rename_cols(old_names::AbstractVector{Symbol},
10696
renamecols::Union{Function, Symbol, AbstractString},
10797
exclude::AbstractVector{Symbol} = Symbol[]) =
@@ -332,9 +322,6 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
332322
joiner = DataFrameJoiner(df1, df2, on, matchmissing, kind)
333323

334324
# Check merge key validity
335-
left_invalid = validate[1] ? any(nonunique(joiner.dfl, joiner.left_on)) : false
336-
right_invalid = validate[2] ? any(nonunique(joiner.dfr, joiner.right_on)) : false
337-
338325
if validate[1]
339326
non_unique_left = nonunique(joiner.dfl, joiner.left_on)
340327
if any(non_unique_left)
@@ -441,7 +428,7 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
441428
try_idx = 0
442429
while hasproperty(joined, unique_indicator)
443430
try_idx += 1
444-
unique_indicator = Symbol(string(indicator, "_", try_idx))
431+
unique_indicator = Symbol(indicator, "_", try_idx)
445432
end
446433
end
447434

@@ -628,8 +615,8 @@ change in future releases.
628615
if `true`, duplicate names will be suffixed with `_i`
629616
(`i` starting at 1 for the first duplicate).
630617
- `source` : Default: `nothing`. If a `Symbol` or string, adds indicator
631-
column with the given name, for whether a row appeared in only `df1` (`"left_only"`),
632-
only `df2` (`"right_only"`) or in both (`"both"`). If the name is already in use,
618+
column with the given name, for whether a row appeared in only `df1` (`"left_only"`)
619+
or in both (`"both"`). If the name is already in use,
633620
the column name will be modified if `makeunique=true`.
634621
- `validate` : whether to check that columns passed as the `on` argument
635622
define unique keys in each input data frame (according to `isequal`).
@@ -775,8 +762,8 @@ change in future releases.
775762
if `true`, duplicate names will be suffixed with `_i`
776763
(`i` starting at 1 for the first duplicate).
777764
- `source` : Default: `nothing`. If a `Symbol` or string, adds indicator
778-
column with the given name for whether a row appeared in only `df1` (`"left_only"`),
779-
only `df2` (`"right_only"`) or in both (`"both"`). If the name is already in use,
765+
column with the given name for whether a row appeared in only `df2` (`"right_only"`)
766+
or in both (`"both"`). If the name is already in use,
780767
the column name will be modified if `makeunique=true`.
781768
- `validate` : whether to check that columns passed as the `on` argument
782769
define unique keys in each input data frame (according to `isequal`).

src/join/inplace.jl

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""
2+
leftjoin!(df1, df2; on, makeunique=false, source=nothing,
3+
matchmissing=:error)
4+
5+
6+
Perform a left join of two data frame objects by updating `df1` with the
7+
joined columns from `df2`.
8+
9+
A left join includes all rows from `df1`.
10+
Rows and columns from `df1` are left untouched.
11+
Each row in `df1` must have at most one match in `df2` based on `on` columns.
12+
13+
# Arguments
14+
- `df1`, `df2`: the `AbstractDataFrames` to be joined
15+
16+
# Keyword Arguments
17+
- `on` : A column name to join `df1` and `df2` on. If the columns on which
18+
`df1` and `df2` will be joined have different names, then a `left=>right`
19+
pair can be passed. It is also allowed to perform a join on multiple columns,
20+
in which case a vector of column names or column name pairs can be passed
21+
(mixing names and pairs is allowed).
22+
- `makeunique` : if `false` (the default), an error will be raised
23+
if duplicate names are found in columns not joined on;
24+
if `true`, duplicate names will be suffixed with `_i`
25+
(`i` starting at 1 for the first duplicate).
26+
- `source` : Default: `nothing`. If a `Symbol` or string, adds indicator
27+
column with the given name, for whether a row appeared in only `df1` (`"left_only"`)
28+
or in both (`"both"`). If the name is already in use,
29+
the column name will be modified if `makeunique=true`.
30+
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
31+
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
32+
matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns;
33+
`isequal` is used for comparisons of rows for equality
34+
35+
The columns added to `df1` from `df2` will support missing values.
36+
37+
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
38+
imaginary part of the number. If you need to perform a join on such values use
39+
CategoricalArrays.jl and transform a column containing such values into a
40+
`CategoricalVector`.
41+
42+
See also: [`leftjoin`](@ref).
43+
44+
# Examples
45+
```jldoctest
46+
julia> name = DataFrame(ID = [1, 2, 3], Name = ["John Doe", "Jane Doe", "Joe Blogs"])
47+
3×2 DataFrame
48+
Row │ ID Name
49+
│ Int64 String
50+
─────┼──────────────────
51+
1 │ 1 John Doe
52+
2 │ 2 Jane Doe
53+
3 │ 3 Joe Blogs
54+
55+
julia> job = DataFrame(ID = [1, 2, 4], Job = ["Lawyer", "Doctor", "Farmer"])
56+
3×2 DataFrame
57+
Row │ ID Job
58+
│ Int64 String
59+
─────┼───────────────
60+
1 │ 1 Lawyer
61+
2 │ 2 Doctor
62+
3 │ 4 Farmer
63+
64+
julia> leftjoin!(name, job, on = :ID)
65+
3×3 DataFrame
66+
Row │ ID Name Job
67+
│ Int64 String String?
68+
─────┼───────────────────────────
69+
1 │ 1 John Doe Lawyer
70+
2 │ 2 Jane Doe Doctor
71+
3 │ 3 Joe Blogs missing
72+
73+
julia> job2 = DataFrame(identifier = [1, 2, 4], Job = ["Lawyer", "Doctor", "Farmer"])
74+
3×2 DataFrame
75+
Row │ identifier Job
76+
│ Int64 String
77+
─────┼────────────────────
78+
1 │ 1 Lawyer
79+
2 │ 2 Doctor
80+
3 │ 4 Farmer
81+
82+
julia> leftjoin!(name, job2, on = :ID => :identifier, makeunique=true, source=:source)
83+
3×5 DataFrame
84+
Row │ ID Name Job Job_1 source
85+
│ Int64 String String? String? String
86+
─────┼───────────────────────────────────────────────
87+
1 │ 1 John Doe Lawyer Lawyer both
88+
2 │ 2 Jane Doe Doctor Doctor both
89+
3 │ 3 Joe Blogs missing missing left_only
90+
```
91+
"""
92+
function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame;
                   on::Union{<:OnType, AbstractVector}=Symbol[], makeunique::Bool=false,
                   source::Union{Nothing, Symbol, AbstractString}=nothing,
                   matchmissing::Symbol=:error)
    # Overview: validate the inputs, compute for every row of `df1` the matching
    # row of the right table (0 when there is none), append every non-`on` column
    # of the right table to `df1`, optionally append the `source` indicator
    # column, and return `df1`.
    #
    # All checks that can throw are performed before the first column is
    # inserted, so a rejected call does not leave `df1` partially updated.

    _check_consistency(df1)
    _check_consistency(df2)

    if !is_column_insertion_allowed(df1)
        throw(ArgumentError("leftjoin! is only supported if `df1` is a `DataFrame`, " *
                            "or a SubDataFrame created with `:` as column selector"))
    end

    # the default value of `on` is an empty vector, i.e. the caller did not pass it
    if on == []
        throw(ArgumentError("Missing join argument 'on'."))
    end

    joiner = DataFrameJoiner(df1, df2, on, matchmissing, :left)

    # names of the right table columns that are not join keys;
    # these are the columns that get appended to `df1`
    right_noon_names = names(joiner.dfr, Not(joiner.right_on))
    if !makeunique
        clashing_names = intersect(right_noon_names, names(df1))
        if !isempty(clashing_names)
            throw(ArgumentError("the following columns are present in both " *
                                "left and right data frames but not listed in `on`: " *
                                join(clashing_names, ", ") *
                                ". Pass makeunique=true to add a suffix automatically to " *
                                "columns names from the right data frame."))
        end
    end

    left_ixs_inner, right_ixs_inner = find_inner_rows(joiner)

    # throws if some row of `df1` is matched more than once
    right_ixs = _map_leftjoin_ixs(nrow(df1), left_ixs_inner, right_ixs_inner)

    # Without `makeunique` the inserted columns keep the names listed in
    # `right_noon_names`, so a clash of the indicator column name can be detected
    # here, before `df1` is modified.
    if source !== nothing && !makeunique &&
       (hasproperty(df1, source) || string(source) in right_noon_names)
        throw(ArgumentError("joined data frame already has column " *
                            ":$source. Pass makeunique=true to " *
                            "make it unique using a suffix automatically."))
    end

    # TODO: consider adding threading support in the future
    for colname in right_noon_names
        rcol = joiner.dfr[!, colname] # note that joiner.dfr does not have to be df2
        rcol_joined = compose_joined_rcol!(rcol, similar_missing(rcol, nrow(df1)),
                                           right_ixs)
        # if df1 isa SubDataFrame we must copy columns
        insertcols!(df1, colname => rcol_joined, makeunique=makeunique,
                    copycols=!(df1 isa DataFrame))
    end

    if source !== nothing
        pool = ["left_only", "right_only", "both"]
        invpool = Dict{String, UInt32}("left_only" => 1,
                                       "right_only" => 2,
                                       "both" => 3)
        # reference 1 ("left_only") for rows without a match, 3 ("both") otherwise
        refs = UInt32.(2 .* (right_ixs .> 0) .+ 1)
        indicatorcol = PooledArray(PooledArrays.RefArray(refs), invpool, pool)

        # with `makeunique` the final name depends on the columns inserted above,
        # so it can only be chosen now
        unique_indicator = source
        if makeunique
            try_idx = 0
            while hasproperty(df1, unique_indicator)
                try_idx += 1
                unique_indicator = Symbol(source, "_", try_idx)
            end
        end

        # safeguard only: the early check above is expected to have caught this
        if hasproperty(df1, unique_indicator)
            throw(ArgumentError("joined data frame already has column " *
                                ":$unique_indicator. Pass makeunique=true to " *
                                "make it unique using a suffix automatically."))
        end
        df1[!, unique_indicator] = indicatorcol
    end
    return df1
end
160+
161+
function _map_leftjoin_ixs(out_len::Int,
                           left_ixs_inner::Vector{Int},
                           right_ixs_inner::Vector{Int})
    # Build a vector of length `out_len` whose entry `i` is the right-table row
    # matched to left row `i`, or 0 when left row `i` has no match.
    # Throws `ArgumentError` if the same left row is matched more than once.
    matched = fill(0, out_len)
    npairs = min(length(left_ixs_inner), length(right_ixs_inner))
    @inbounds for k in 1:npairs
        lrow = left_ixs_inner[k]
        # a positive entry means this left row was already assigned a right row
        matched[lrow] > 0 &&
            throw(ArgumentError("duplicate rows found in right table"))
        matched[lrow] = right_ixs_inner[k]
    end
    return matched
end
173+
174+
function compose_joined_rcol!(rcol::AbstractVector,
                              rcol_joined::AbstractVector,
                              right_ixs::Vector{Int})
    # Fill `rcol_joined` in place: for every position whose entry in `right_ixs`
    # is positive, copy the element of `rcol` stored at that index; positions
    # with a non-positive entry are left as they were. Returns `rcol_joined`.
    @assert length(rcol_joined) == length(right_ixs)
    @inbounds for row in 1:length(right_ixs)
        src = right_ixs[row]
        src > 0 || continue
        rcol_joined[row] = rcol[src]
    end
    return rcol_joined
end

0 commit comments

Comments
 (0)