Skip to content

Commit 4eaef97

Browse files
samukweku and sammychoco authored
[ENH] Performance for left/right join when sort_by_appearance is False (#1170)
* perf improvements left/right joins
* left/right joins implemented as concat
* update doc and logic for sort_by_appearance
* changelog
* Update conditional_join.ipynb
* changelog
* Update CHANGELOG.md
* Update CHANGELOG.md
* Update CHANGELOG.md
* changelog
* update merge logic
* update docs
* update based on feedback
* updates based on feedback

Co-authored-by: sammychoco <[email protected]>
1 parent a8a709e commit 4eaef97

File tree

3 files changed

+175
-105
lines changed

3 files changed

+175
-105
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
- [ENH] Faster computation for a single non-equi join, with a numba engine. Issue #1102 @samukweku
2323
- [TST] Fix testcases failing on Window. Issue #1160 @Zeroto521, and @samukweku
2424
- [INF] Cancel old workflow runs via Github Action `concurrency`. PR #1161 @Zeroto521
25-
- [ENH] Faster computation for non-equi join, with a numba engine. Issue #1102 @samukweku
25+
- [ENH] Faster computation for non-equi join, with a numba engine. Speed improvement for left/right joins when `sort_by_appearance` is False. Issue #1102 @samukweku
2626
- [BUG] Avoid `change_type` mutating original `DataFrame`. PR #1162 @Zeroto521
2727
- [ENH] The parameter `column_name` of `change_type` totally supports inputing multi-column now. #1163 @Zeroto521
2828
- [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku

janitor/functions/conditional_join.py

Lines changed: 51 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ def conditional_join(
5454
performance could be improved by setting `use_numba` to `True`.
5555
This assumes that `numba` is installed.
5656
57+
To preserve row order, set `sort_by_appearance` to `True`.
58+
5759
This function returns rows, if any, where values from `df` meet the
5860
condition(s) for values from `right`. The conditions are passed in
5961
as a variable argument of tuples, where the tuple is of
@@ -129,10 +131,9 @@ def conditional_join(
129131
:param sort_by_appearance: Default is `False`.
130132
This is useful for scenarios where the user wants
131133
the original order maintained.
132-
If True, values from `df` and `right`
133-
that meet the join condition will be returned
134-
in the final dataframe in the same order
135-
that they were before the join.
134+
If `True` and `how = left`, the row order from the left dataframe
135+
is preserved; if `True` and `how = right`, the row order
136+
from the right dataframe is preserved.
136137
:param df_columns: Columns to select from `df`.
137138
It can be a single column or a list of columns.
138139
It is also possible to rename the output columns via a dictionary.
@@ -1254,12 +1255,6 @@ def _create_frame(
12541255
"""
12551256
Create final dataframe
12561257
"""
1257-
if sort_by_appearance:
1258-
sorter = np.lexsort((right_index, left_index))
1259-
right_index = right_index[sorter]
1260-
left_index = left_index[sorter]
1261-
sorter = None
1262-
12631258
if df_columns:
12641259
df = _cond_join_select_columns(df_columns, df)
12651260

@@ -1269,30 +1264,55 @@ def _create_frame(
12691264
if set(df.columns).intersection(right.columns):
12701265
df, right = _create_multiindex_column(df, right)
12711266

1272-
if how == "inner":
1267+
if sort_by_appearance or (left_index.size == 0):
1268+
if how in {"inner", "left"}:
1269+
right = right.take(right_index)
1270+
right.index = left_index
1271+
else:
1272+
df = df.take(left_index)
1273+
df.index = right_index
1274+
df = pd.merge(
1275+
df,
1276+
right,
1277+
left_index=True,
1278+
right_index=True,
1279+
sort=False,
1280+
copy=False,
1281+
how=how,
1282+
)
1283+
df.index = range(len(df))
1284+
return df
1285+
1286+
def _inner(
1287+
df: pd.DataFrame,
1288+
right: pd.DataFrame,
1289+
left_index: pd.DataFrame,
1290+
right_index: pd.DataFrame,
1291+
) -> pd.DataFrame:
1292+
"""Create DataFrame for inner join"""
12731293
df = {key: value._values[left_index] for key, value in df.items()}
12741294
right = {
12751295
key: value._values[right_index] for key, value in right.items()
12761296
}
1277-
return pd.DataFrame({**df, **right}, copy=False)
1297+
df.update(right)
1298+
return pd.DataFrame(df, copy=False)
12781299

1279-
if how == "left":
1280-
right = {
1281-
key: value._values[right_index] for key, value in right.items()
1282-
}
1283-
right = pd.DataFrame(right, index=left_index, copy=False)
1284-
else:
1285-
df = {key: value._values[left_index] for key, value in df.items()}
1286-
df = pd.DataFrame(df, index=right_index, copy=False)
1300+
if how == "inner":
1301+
return _inner(df, right, left_index, right_index)
12871302

1288-
df = pd.merge(
1289-
df,
1290-
right,
1291-
left_index=True,
1292-
right_index=True,
1293-
how=how,
1294-
copy=False,
1295-
sort=False,
1296-
)
1297-
df.index = range(len(df))
1298-
return df
1303+
if how == "left":
1304+
df_ = np.bincount(left_index, minlength=df.index.size) == 0
1305+
df_ = df_.nonzero()[0]
1306+
if not df_.size:
1307+
return _inner(df, right, left_index, right_index)
1308+
df_ = df.take(df_)
1309+
df = _inner(df, right, left_index, right_index)
1310+
return pd.concat([df, df_], ignore_index=True)
1311+
if how == "right":
1312+
right_ = np.bincount(right_index, minlength=right.index.size) == 0
1313+
right_ = right_.nonzero()[0]
1314+
if not right_.size:
1315+
return _inner(df, right, left_index, right_index)
1316+
right_ = right.take(right_)
1317+
right = _inner(df, right, left_index, right_index)
1318+
return pd.concat([right, right_], ignore_index=True)

tests/functions/test_conditional_join.py

Lines changed: 123 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -296,12 +296,16 @@ def test_single_condition_less_than_floats_keep_first(df, right):
296296
allow_exact_matches=False,
297297
)
298298
expected.index = range(len(expected))
299-
actual = df[["B"]].conditional_join(
300-
right[["Numeric"]].sort_values("Numeric"),
301-
("B", "Numeric", "<"),
302-
how="left",
303-
sort_by_appearance=False,
304-
keep="first",
299+
actual = (
300+
df[["B"]]
301+
.conditional_join(
302+
right[["Numeric"]].sort_values("Numeric"),
303+
("B", "Numeric", "<"),
304+
how="left",
305+
sort_by_appearance=False,
306+
keep="first",
307+
)
308+
.sort_values(["B", "Numeric"], ignore_index=True)
305309
)
306310

307311
assert_frame_equal(expected, actual)
@@ -324,12 +328,16 @@ def test_single_condition_less_than_floats_keep_last(df, right):
324328
allow_exact_matches=False,
325329
)
326330
expected.index = range(len(expected))
327-
actual = df[["B"]].conditional_join(
328-
right[["Numeric"]],
329-
("B", "Numeric", ">"),
330-
how="left",
331-
sort_by_appearance=False,
332-
keep="last",
331+
actual = (
332+
df[["B"]]
333+
.conditional_join(
334+
right[["Numeric"]],
335+
("B", "Numeric", ">"),
336+
how="left",
337+
sort_by_appearance=False,
338+
keep="last",
339+
)
340+
.sort_values(["B", "Numeric"], ignore_index=True)
333341
)
334342

335343
assert_frame_equal(expected, actual)
@@ -378,13 +386,17 @@ def test_single_condition_less_than_floats_keep_first_numba(df, right):
378386
allow_exact_matches=False,
379387
)
380388
expected.index = range(len(expected))
381-
actual = df[["B"]].conditional_join(
382-
right[["Numeric"]],
383-
("B", "Numeric", "<"),
384-
how="left",
385-
sort_by_appearance=False,
386-
keep="first",
387-
use_numba=True,
389+
actual = (
390+
df[["B"]]
391+
.conditional_join(
392+
right[["Numeric"]],
393+
("B", "Numeric", "<"),
394+
how="left",
395+
sort_by_appearance=False,
396+
keep="first",
397+
use_numba=True,
398+
)
399+
.sort_values(["B", "Numeric"], ignore_index=True)
388400
)
389401

390402
assert_frame_equal(expected, actual)
@@ -405,15 +417,21 @@ def test_single_condition_less_than_floats_keep_last_numba(df, right):
405417
right_on="Numeric",
406418
direction="backward",
407419
allow_exact_matches=False,
408-
)
420+
).sort_values(["B", "Numeric"], ascending=[True, False], ignore_index=True)
409421
expected.index = range(len(expected))
410-
actual = df[["B"]].conditional_join(
411-
right[["Numeric"]],
412-
("B", "Numeric", ">"),
413-
how="left",
414-
sort_by_appearance=False,
415-
keep="last",
416-
use_numba=True,
422+
actual = (
423+
df[["B"]]
424+
.conditional_join(
425+
right[["Numeric"]],
426+
("B", "Numeric", ">"),
427+
how="left",
428+
sort_by_appearance=False,
429+
keep="last",
430+
use_numba=True,
431+
)
432+
.sort_values(
433+
["B", "Numeric"], ascending=[True, False], ignore_index=True
434+
)
417435
)
418436

419437
assert_frame_equal(expected, actual)
@@ -1170,13 +1188,17 @@ def test_how_left(df, right):
11701188
expected = (
11711189
df[["A"]]
11721190
.join(expected[["Integers"]], how="left", sort=False)
1191+
.sort_values(["A", "Integers"], ignore_index=True)
11731192
.reset_index(drop=True)
11741193
)
1175-
actual = df[["A"]].conditional_join(
1176-
right[["Integers"]],
1177-
("A", "Integers", "<="),
1178-
how="left",
1179-
sort_by_appearance=True,
1194+
actual = (
1195+
df[["A"]]
1196+
.conditional_join(
1197+
right[["Integers"]],
1198+
("A", "Integers", "<="),
1199+
how="left",
1200+
)
1201+
.sort_values(["A", "Integers"], ignore_index=True)
11801202
)
11811203

11821204
assert_frame_equal(expected, actual)
@@ -1196,13 +1218,17 @@ def test_how_right(df, right):
11961218
expected = (
11971219
expected[["E"]]
11981220
.join(right[["Dates"]], how="right", sort=False)
1221+
.sort_values(["E", "Dates"], ignore_index=True)
11991222
.reset_index(drop=True)
12001223
)
1201-
actual = df[["E"]].conditional_join(
1202-
right[["Dates"]],
1203-
("E", "Dates", ">"),
1204-
how="right",
1205-
sort_by_appearance=True,
1224+
actual = (
1225+
df[["E"]]
1226+
.conditional_join(
1227+
right[["Dates"]],
1228+
("E", "Dates", ">"),
1229+
how="right",
1230+
)
1231+
.sort_values(["E", "Dates"], ignore_index=True)
12061232
)
12071233

12081234
assert_frame_equal(expected, actual)
@@ -1503,14 +1529,18 @@ def test_dual_conditions_gt_and_lt_numbers_left_join(df, right):
15031529
df[["B"]]
15041530
.join(expected[["Numeric", "Floats"]], how="left", sort=False)
15051531
.reset_index(drop=True)
1506-
)
1532+
).sort_values(["B", "Numeric", "Floats"], ignore_index=True)
15071533

1508-
actual = df[["B"]].conditional_join(
1509-
right[["Numeric", "Floats"]],
1510-
("B", "Numeric", ">"),
1511-
("B", "Floats", "<"),
1512-
how="left",
1513-
sort_by_appearance=True,
1534+
actual = (
1535+
df[["B"]]
1536+
.conditional_join(
1537+
right[["Numeric", "Floats"]],
1538+
("B", "Numeric", ">"),
1539+
("B", "Floats", "<"),
1540+
how="left",
1541+
sort_by_appearance=True,
1542+
)
1543+
.sort_values(["B", "Numeric", "Floats"], ignore_index=True)
15141544
)
15151545

15161546
assert_frame_equal(expected, actual)
@@ -1539,15 +1569,19 @@ def test_dual_conditions_gt_and_lt_numbers_right_join(df, right):
15391569
expected = (
15401570
expected[["B"]]
15411571
.join(right[["Numeric", "Floats"]], how="right", sort=False)
1572+
.sort_values(["Numeric", "Floats", "B"], ignore_index=True)
15421573
.reset_index(drop=True)
15431574
)
15441575

1545-
actual = df[["B"]].conditional_join(
1546-
right[["Numeric", "Floats"]],
1547-
("B", "Numeric", ">"),
1548-
("B", "Floats", "<"),
1549-
how="right",
1550-
sort_by_appearance=True,
1576+
actual = (
1577+
df[["B"]]
1578+
.conditional_join(
1579+
right[["Numeric", "Floats"]],
1580+
("B", "Numeric", ">"),
1581+
("B", "Floats", "<"),
1582+
how="right",
1583+
)
1584+
.sort_values(["Numeric", "Floats", "B"], ignore_index=True)
15511585
)
15521586
assert_frame_equal(expected, actual)
15531587

@@ -1564,18 +1598,26 @@ def test_dual_ne_extension(df, right):
15641598
df = df.astype({"A": "Int64"})
15651599
right = right.astype({"Integers": "Int64"})
15661600
expected = df.merge(right, how="cross")
1567-
expected = expected.loc[
1568-
expected.A.ne(expected.Integers) & expected.B.ne(expected.Numeric),
1569-
filters,
1570-
].reset_index(drop=True)
1601+
expected = (
1602+
expected.loc[
1603+
expected.A.ne(expected.Integers) & expected.B.ne(expected.Numeric),
1604+
filters,
1605+
]
1606+
.reset_index(drop=True)
1607+
.sort_values(filters, ignore_index=True)
1608+
)
15711609

1572-
actual = df.conditional_join(
1573-
right,
1574-
("A", "Integers", "!="),
1575-
("B", "Numeric", "!="),
1576-
how="inner",
1577-
sort_by_appearance=True,
1578-
).filter(filters)
1610+
actual = (
1611+
df.conditional_join(
1612+
right,
1613+
("A", "Integers", "!="),
1614+
("B", "Numeric", "!="),
1615+
how="inner",
1616+
sort_by_appearance=True,
1617+
)
1618+
.filter(filters)
1619+
.sort_values(filters, ignore_index=True)
1620+
)
15791621
assert_frame_equal(expected, actual)
15801622

15811623

@@ -1623,19 +1665,27 @@ def test_dual_ne_numba_extension(df, right):
16231665
df = df.astype({"A": "Int64"})
16241666
right = right.astype({"Integers": "Int64"})
16251667
expected = df.merge(right, how="cross")
1626-
expected = expected.loc[
1627-
expected.A.ne(expected.Integers) & expected.B.ne(expected.Numeric),
1628-
filters,
1629-
].reset_index(drop=True)
1668+
expected = (
1669+
expected.loc[
1670+
expected.A.ne(expected.Integers) & expected.B.ne(expected.Numeric),
1671+
filters,
1672+
]
1673+
.reset_index(drop=True)
1674+
.sort_values(filters, ignore_index=True)
1675+
)
16301676

1631-
actual = df.conditional_join(
1632-
right,
1633-
("A", "Integers", "!="),
1634-
("B", "Numeric", "!="),
1635-
how="inner",
1636-
use_numba=True,
1637-
sort_by_appearance=True,
1638-
).filter(filters)
1677+
actual = (
1678+
df.conditional_join(
1679+
right,
1680+
("A", "Integers", "!="),
1681+
("B", "Numeric", "!="),
1682+
how="inner",
1683+
use_numba=True,
1684+
sort_by_appearance=True,
1685+
)
1686+
.filter(filters)
1687+
.sort_values(filters, ignore_index=True)
1688+
)
16391689
assert_frame_equal(expected, actual)
16401690

16411691

0 commit comments

Comments
 (0)