[ENH] Performance for left/right join when sort_by_appearance is False (#1170)

samukweku · sammychoco · web-flow · commit 4eaef97f9223 · 2022-09-24T14:36:44.000-04:00
* perf improvements left/right joins

* left/right joins implemented as concat

* update doc and logic for sort_by_appearance

* changelog

* Update conditional_join.ipynb

* changelog

* Update CHANGELOG.md

* Update CHANGELOG.md

* Update CHANGELOG.md

* changelog

* update merge logic

* update docs

* update based on feedback

* updates based on feedback

Co-authored-by: sammychoco <samuel.oranyeli@slalom.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,7 +22,7 @@
 -   [ENH] Faster computation for a single non-equi join, with a numba engine. Issue #1102 @samukweku
 -   [TST] Fix testcases failing on Window. Issue #1160 @Zeroto521, and @samukweku
 -   [INF] Cancel old workflow runs via Github Action `concurrency`. PR #1161 @Zeroto521
--   [ENH] Faster computation for non-equi join, with a numba engine. Issue #1102 @samukweku
+-   [ENH] Faster computation for non-equi join, with a numba engine. Speed improvement for left/right joins when `sort_by_appearance` is False. Issue #1102 @samukweku
 -   [BUG] Avoid `change_type` mutating original `DataFrame`. PR #1162 @Zeroto521
 -   [ENH] The parameter `column_name` of `change_type` totally supports inputing multi-column now. #1163 @Zeroto521
 -   [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku
diff --git a/janitor/functions/conditional_join.py b/janitor/functions/conditional_join.py
@@ -54,6 +54,8 @@ def conditional_join(
     performance could be improved by setting `use_numba` to `True`.
     This assumes that `numba` is installed.
 
+    To preserve row order, set `sort_by_appearance` to `True`.
+
     This function returns rows, if any, where values from `df` meet the
     condition(s) for values from `right`. The conditions are passed in
     as a variable argument of tuples, where the tuple is of
@@ -129,10 +131,9 @@ def conditional_join(
     :param sort_by_appearance: Default is `False`.
         This is useful for scenarios where the user wants
         the original order maintained.
-        If True, values from `df` and `right`
-        that meet the join condition will be returned
-        in the final dataframe in the same order
-        that they were before the join.
+        If `True` and `how = left`, the row order from the left dataframe
+        is preserved; if `True` and `how = right`, the row order
+        from the right dataframe is preserved.
     :param df_columns: Columns to select from `df`.
         It can be a single column or a list of columns.
         It is also possible to rename the output columns via a dictionary.
@@ -1254,12 +1255,6 @@ def _create_frame(
     """
     Create final dataframe
     """
-    if sort_by_appearance:
-        sorter = np.lexsort((right_index, left_index))
-        right_index = right_index[sorter]
-        left_index = left_index[sorter]
-        sorter = None
-
     if df_columns:
         df = _cond_join_select_columns(df_columns, df)
 
@@ -1269,30 +1264,55 @@ def _create_frame(
     if set(df.columns).intersection(right.columns):
         df, right = _create_multiindex_column(df, right)
 
-    if how == "inner":
+    if sort_by_appearance or (left_index.size == 0):
+        if how in {"inner", "left"}:
+            right = right.take(right_index)
+            right.index = left_index
+        else:
+            df = df.take(left_index)
+            df.index = right_index
+        df = pd.merge(
+            df,
+            right,
+            left_index=True,
+            right_index=True,
+            sort=False,
+            copy=False,
+            how=how,
+        )
+        df.index = range(len(df))
+        return df
+
+    def _inner(
+        df: pd.DataFrame,
+        right: pd.DataFrame,
+        left_index: pd.DataFrame,
+        right_index: pd.DataFrame,
+    ) -> pd.DataFrame:
+        """Create DataFrame for inner join"""
         df = {key: value._values[left_index] for key, value in df.items()}
         right = {
             key: value._values[right_index] for key, value in right.items()
         }
-        return pd.DataFrame({**df, **right}, copy=False)
+        df.update(right)
+        return pd.DataFrame(df, copy=False)
 
-    if how == "left":
-        right = {
-            key: value._values[right_index] for key, value in right.items()
-        }
-        right = pd.DataFrame(right, index=left_index, copy=False)
-    else:
-        df = {key: value._values[left_index] for key, value in df.items()}
-        df = pd.DataFrame(df, index=right_index, copy=False)
+    if how == "inner":
+        return _inner(df, right, left_index, right_index)
 
-    df = pd.merge(
-        df,
-        right,
-        left_index=True,
-        right_index=True,
-        how=how,
-        copy=False,
-        sort=False,
-    )
-    df.index = range(len(df))
-    return df
+    if how == "left":
+        df_ = np.bincount(left_index, minlength=df.index.size) == 0
+        df_ = df_.nonzero()[0]
+        if not df_.size:
+            return _inner(df, right, left_index, right_index)
+        df_ = df.take(df_)
+        df = _inner(df, right, left_index, right_index)
+        return pd.concat([df, df_], ignore_index=True)
+    if how == "right":
+        right_ = np.bincount(right_index, minlength=right.index.size) == 0
+        right_ = right_.nonzero()[0]
+        if not right_.size:
+            return _inner(df, right, left_index, right_index)
+        right_ = right.take(right_)
+        right = _inner(df, right, left_index, right_index)
+        return pd.concat([right, right_], ignore_index=True)
diff --git a/tests/functions/test_conditional_join.py b/tests/functions/test_conditional_join.py
@@ -296,12 +296,16 @@ def test_single_condition_less_than_floats_keep_first(df, right):
         allow_exact_matches=False,
     )
     expected.index = range(len(expected))
-    actual = df[["B"]].conditional_join(
-        right[["Numeric"]].sort_values("Numeric"),
-        ("B", "Numeric", "<"),
-        how="left",
-        sort_by_appearance=False,
-        keep="first",
+    actual = (
+        df[["B"]]
+        .conditional_join(
+            right[["Numeric"]].sort_values("Numeric"),
+            ("B", "Numeric", "<"),
+            how="left",
+            sort_by_appearance=False,
+            keep="first",
+        )
+        .sort_values(["B", "Numeric"], ignore_index=True)
     )
 
     assert_frame_equal(expected, actual)
@@ -324,12 +328,16 @@ def test_single_condition_less_than_floats_keep_last(df, right):
         allow_exact_matches=False,
     )
     expected.index = range(len(expected))
-    actual = df[["B"]].conditional_join(
-        right[["Numeric"]],
-        ("B", "Numeric", ">"),
-        how="left",
-        sort_by_appearance=False,
-        keep="last",
+    actual = (
+        df[["B"]]
+        .conditional_join(
+            right[["Numeric"]],
+            ("B", "Numeric", ">"),
+            how="left",
+            sort_by_appearance=False,
+            keep="last",
+        )
+        .sort_values(["B", "Numeric"], ignore_index=True)
     )
 
     assert_frame_equal(expected, actual)
@@ -378,13 +386,17 @@ def test_single_condition_less_than_floats_keep_first_numba(df, right):
         allow_exact_matches=False,
     )
     expected.index = range(len(expected))
-    actual = df[["B"]].conditional_join(
-        right[["Numeric"]],
-        ("B", "Numeric", "<"),
-        how="left",
-        sort_by_appearance=False,
-        keep="first",
-        use_numba=True,
+    actual = (
+        df[["B"]]
+        .conditional_join(
+            right[["Numeric"]],
+            ("B", "Numeric", "<"),
+            how="left",
+            sort_by_appearance=False,
+            keep="first",
+            use_numba=True,
+        )
+        .sort_values(["B", "Numeric"], ignore_index=True)
     )
 
     assert_frame_equal(expected, actual)
@@ -405,15 +417,21 @@ def test_single_condition_less_than_floats_keep_last_numba(df, right):
         right_on="Numeric",
         direction="backward",
         allow_exact_matches=False,
-    )
+    ).sort_values(["B", "Numeric"], ascending=[True, False], ignore_index=True)
     expected.index = range(len(expected))
-    actual = df[["B"]].conditional_join(
-        right[["Numeric"]],
-        ("B", "Numeric", ">"),
-        how="left",
-        sort_by_appearance=False,
-        keep="last",
-        use_numba=True,
+    actual = (
+        df[["B"]]
+        .conditional_join(
+            right[["Numeric"]],
+            ("B", "Numeric", ">"),
+            how="left",
+            sort_by_appearance=False,
+            keep="last",
+            use_numba=True,
+        )
+        .sort_values(
+            ["B", "Numeric"], ascending=[True, False], ignore_index=True
+        )
     )
 
     assert_frame_equal(expected, actual)
@@ -1170,13 +1188,17 @@ def test_how_left(df, right):
     expected = (
         df[["A"]]
         .join(expected[["Integers"]], how="left", sort=False)
+        .sort_values(["A", "Integers"], ignore_index=True)
         .reset_index(drop=True)
     )
-    actual = df[["A"]].conditional_join(
-        right[["Integers"]],
-        ("A", "Integers", "<="),
-        how="left",
-        sort_by_appearance=True,
+    actual = (
+        df[["A"]]
+        .conditional_join(
+            right[["Integers"]],
+            ("A", "Integers", "<="),
+            how="left",
+        )
+        .sort_values(["A", "Integers"], ignore_index=True)
     )
 
     assert_frame_equal(expected, actual)
@@ -1196,13 +1218,17 @@ def test_how_right(df, right):
     expected = (
         expected[["E"]]
         .join(right[["Dates"]], how="right", sort=False)
+        .sort_values(["E", "Dates"], ignore_index=True)
         .reset_index(drop=True)
     )
-    actual = df[["E"]].conditional_join(
-        right[["Dates"]],
-        ("E", "Dates", ">"),
-        how="right",
-        sort_by_appearance=True,
+    actual = (
+        df[["E"]]
+        .conditional_join(
+            right[["Dates"]],
+            ("E", "Dates", ">"),
+            how="right",
+        )
+        .sort_values(["E", "Dates"], ignore_index=True)
     )
 
     assert_frame_equal(expected, actual)
@@ -1503,14 +1529,18 @@ def test_dual_conditions_gt_and_lt_numbers_left_join(df, right):
         df[["B"]]
         .join(expected[["Numeric", "Floats"]], how="left", sort=False)
         .reset_index(drop=True)
-    )
+    ).sort_values(["B", "Numeric", "Floats"], ignore_index=True)
 
-    actual = df[["B"]].conditional_join(
-        right[["Numeric", "Floats"]],
-        ("B", "Numeric", ">"),
-        ("B", "Floats", "<"),
-        how="left",
-        sort_by_appearance=True,
+    actual = (
+        df[["B"]]
+        .conditional_join(
+            right[["Numeric", "Floats"]],
+            ("B", "Numeric", ">"),
+            ("B", "Floats", "<"),
+            how="left",
+            sort_by_appearance=True,
+        )
+        .sort_values(["B", "Numeric", "Floats"], ignore_index=True)
     )
 
     assert_frame_equal(expected, actual)
@@ -1539,15 +1569,19 @@ def test_dual_conditions_gt_and_lt_numbers_right_join(df, right):
     expected = (
         expected[["B"]]
         .join(right[["Numeric", "Floats"]], how="right", sort=False)
+        .sort_values(["Numeric", "Floats", "B"], ignore_index=True)
         .reset_index(drop=True)
     )
 
-    actual = df[["B"]].conditional_join(
-        right[["Numeric", "Floats"]],
-        ("B", "Numeric", ">"),
-        ("B", "Floats", "<"),
-        how="right",
-        sort_by_appearance=True,
+    actual = (
+        df[["B"]]
+        .conditional_join(
+            right[["Numeric", "Floats"]],
+            ("B", "Numeric", ">"),
+            ("B", "Floats", "<"),
+            how="right",
+        )
+        .sort_values(["Numeric", "Floats", "B"], ignore_index=True)
     )
     assert_frame_equal(expected, actual)
 
@@ -1564,18 +1598,26 @@ def test_dual_ne_extension(df, right):
     df = df.astype({"A": "Int64"})
     right = right.astype({"Integers": "Int64"})
     expected = df.merge(right, how="cross")
-    expected = expected.loc[
-        expected.A.ne(expected.Integers) & expected.B.ne(expected.Numeric),
-        filters,
-    ].reset_index(drop=True)
+    expected = (
+        expected.loc[
+            expected.A.ne(expected.Integers) & expected.B.ne(expected.Numeric),
+            filters,
+        ]
+        .reset_index(drop=True)
+        .sort_values(filters, ignore_index=True)
+    )
 
-    actual = df.conditional_join(
-        right,
-        ("A", "Integers", "!="),
-        ("B", "Numeric", "!="),
-        how="inner",
-        sort_by_appearance=True,
-    ).filter(filters)
+    actual = (
+        df.conditional_join(
+            right,
+            ("A", "Integers", "!="),
+            ("B", "Numeric", "!="),
+            how="inner",
+            sort_by_appearance=True,
+        )
+        .filter(filters)
+        .sort_values(filters, ignore_index=True)
+    )
     assert_frame_equal(expected, actual)
 
 
@@ -1623,19 +1665,27 @@ def test_dual_ne_numba_extension(df, right):
     df = df.astype({"A": "Int64"})
     right = right.astype({"Integers": "Int64"})
     expected = df.merge(right, how="cross")
-    expected = expected.loc[
-        expected.A.ne(expected.Integers) & expected.B.ne(expected.Numeric),
-        filters,
-    ].reset_index(drop=True)
+    expected = (
+        expected.loc[
+            expected.A.ne(expected.Integers) & expected.B.ne(expected.Numeric),
+            filters,
+        ]
+        .reset_index(drop=True)
+        .sort_values(filters, ignore_index=True)
+    )
 
-    actual = df.conditional_join(
-        right,
-        ("A", "Integers", "!="),
-        ("B", "Numeric", "!="),
-        how="inner",
-        use_numba=True,
-        sort_by_appearance=True,
-    ).filter(filters)
+    actual = (
+        df.conditional_join(
+            right,
+            ("A", "Integers", "!="),
+            ("B", "Numeric", "!="),
+            how="inner",
+            use_numba=True,
+            sort_by_appearance=True,
+        )
+        .filter(filters)
+        .sort_values(filters, ignore_index=True)
+    )
     assert_frame_equal(expected, actual)