diff --git a/.gitignore b/.gitignore index d5e11fb8b..bd9b98b07 100644 --- a/.gitignore +++ b/.gitignore @@ -143,3 +143,4 @@ tags *.profraw /scratch.py midpoint.csv +examples/notebooks/cond_join.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index 66dd5b817..430870553 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## [Unreleased] -- [ENH] Added `row_count` parameter for janitor.conditional_join - Issue #1269 @samukweku +- [ENH] `return_ragged_arrays` deprecated; get_join_indices function now returns a dictionary - Issue #520 @samukweku - [ENH] Reverse deprecation of `pivot_wider()` -- Issue #1464 - [ENH] Add accessor and method for pandas DataFrameGroupBy objects. - Issue #587 @samukweku - [ENH] Call mutate/summarise directly on groupby objects instead. Also add `ungroup` method to expose underlying dataframe of a grouped object. - Issue #1511 @samukweku diff --git a/janitor/functions/_conditional_join/_greater_than_indices.py b/janitor/functions/_conditional_join/_greater_than_indices.py new file mode 100644 index 000000000..c5925fe10 --- /dev/null +++ b/janitor/functions/_conditional_join/_greater_than_indices.py @@ -0,0 +1,141 @@ +# helper functions for >/>= +import numpy as np +import pandas as pd + +from janitor.functions._conditional_join._helpers import ( + _null_checks_cond_join, + _sort_if_not_monotonic, +) + + +def _ge_gt_indices( + left: pd.array, + left_index: np.ndarray, + right: pd.array, + strict: bool, +) -> tuple | None: + """ + Use binary search to get indices where left + is greater than or equal to right. + + If strict is True, then only indices + where `left` is greater than + (but not equal to) `right` are returned. + """ + search_indices = right.searchsorted(left, side="right") + # if any of the positions in `search_indices` + # is equal to 0 (less than 1), it implies that + # left[position] is not greater than any value + # in right + booleans = search_indices > 0 + if not booleans.any(): + return None + if not booleans.all(): + left = left[booleans] + left_index = left_index[booleans] + search_indices = search_indices[booleans] + # the idea here is that if there are any equal values + # shift downwards to the immediate next position + # that is not equal + if strict: + booleans = left == right[search_indices - 1] + # replace positions where rows are equal with + # searchsorted('left'); + # this works fine since we will be using the value + # as the right side of a slice, which is not included + # in the final computed value + if booleans.any(): + replacements = right.searchsorted(left, side="left") + # now we can safely replace values + # with strictly greater than positions + search_indices = np.where(booleans, replacements, search_indices) + # any value less than 1 should be discarded + # since the lowest value for binary search + # with side='right' should be 1 + booleans = search_indices > 0 + if not booleans.any(): + return None + if not booleans.all(): + left_index = left_index[booleans] + search_indices = search_indices[booleans] + return left_index, search_indices + + +def _greater_than_indices( + left: pd.Series, + right: pd.Series, + strict: bool, + keep: str, + return_matching_indices: bool, +) -> dict | None: + """ + Use binary search to get indices where left + is greater than or equal to right. + + If strict is True, then only indices + where `left` is greater than + (but not equal to) `right` are returned. + """ + # quick break, avoiding the hassle + if left.max() < right.min(): + return { + "left_index": np.array([], dtype=np.intp), + "right_index": np.array([], dtype=np.intp), + } + outcome = _null_checks_cond_join(series=left) + if outcome is None: + return { + "left_index": np.array([], dtype=np.intp), + "right_index": np.array([], dtype=np.intp), + } + left, _ = outcome + outcome = _null_checks_cond_join(series=right) + if outcome is None: + return { + "left_index": np.array([], dtype=np.intp), + "right_index": np.array([], dtype=np.intp), + } + right, any_nulls = outcome + right, right_is_sorted = _sort_if_not_monotonic(series=right) + outcome = _ge_gt_indices( + left=left.array, + right=right.array, + left_index=left.index._values, + strict=strict, + ) + if outcome is None: + return { + "left_index": np.array([], dtype=np.intp), + "right_index": np.array([], dtype=np.intp), + } + left_index, search_indices = outcome + right_index = right.index._values + if right_is_sorted & (keep == "first"): + indexer = np.zeros_like(search_indices) + return {"left_index": left_index, "right_index": right_index[indexer]} + if right_is_sorted & (keep == "last") & any_nulls: + return { + "left_index": left_index, + "right_index": right_index[search_indices - 1], + } + if right_is_sorted & (keep == "last"): + return {"left_index": left_index, "right_index": search_indices - 1} + if keep == "first": + right = [right_index[:ind] for ind in search_indices] + right = [arr.min() for arr in right] + return {"left_index": left_index, "right_index": right} + if keep == "last": + right = [right_index[:ind] for ind in search_indices] + right = [arr.max() for arr in right] + return {"left_index": left_index, "right_index": right} + if return_matching_indices: + return dict( + left_index=left_index, + right_index=right_index, + starts=np.repeat(0, search_indices.size), + ends=search_indices, + ) + right = [right_index[:ind] for ind in search_indices] + right = np.concatenate(right) + left = left_index.repeat(search_indices) + return {"left_index": left, "right_index": right} diff --git a/janitor/functions/_conditional_join/_helpers.py b/janitor/functions/_conditional_join/_helpers.py new file mode 100644 index 000000000..2eddcb908 --- /dev/null +++ b/janitor/functions/_conditional_join/_helpers.py @@ -0,0 +1,82 @@ +# helper functions for conditional_join.py + +from enum import Enum +from typing import Sequence + +import numpy as np +import pandas as pd + + +class _JoinOperator(Enum): + """ + List of operators used in conditional_join. + """ + + GREATER_THAN = ">" + LESS_THAN = "<" + GREATER_THAN_OR_EQUAL = ">=" + LESS_THAN_OR_EQUAL = "<=" + STRICTLY_EQUAL = "==" + NOT_EQUAL = "!=" + + +less_than_join_types = { + _JoinOperator.LESS_THAN.value, + _JoinOperator.LESS_THAN_OR_EQUAL.value, +} +greater_than_join_types = { + _JoinOperator.GREATER_THAN.value, + _JoinOperator.GREATER_THAN_OR_EQUAL.value, +} + + +def _maybe_remove_nulls_from_dataframe( + df: pd.DataFrame, columns: Sequence, return_bools: bool = False +): + """ + Remove nulls if op is not !=; + """ + any_nulls = df.loc[:, [*columns]].isna().any(axis=1) + if any_nulls.all(): + return None + if return_bools: + any_nulls = ~any_nulls + return any_nulls + if any_nulls.any(): + df = df.loc[~any_nulls] + return df + + +def _null_checks_cond_join(series: pd.Series) -> tuple | None: + """ + Checks for nulls in the pandas series before conducting binary search. + """ + any_nulls = series.isna() + if any_nulls.all(): + return None + if any_nulls.any(): + series = series[~any_nulls] + return series, any_nulls.any() + + +def _sort_if_not_monotonic(series: pd.Series) -> pd.Series | None: + """ + Sort the pandas `series` if it is not monotonic increasing + """ + + is_sorted = series.is_monotonic_increasing + if not is_sorted: + series = series.sort_values(kind="stable") + return series, is_sorted + + +def _keep_output(keep: str, left: np.ndarray, right: np.ndarray): + """return indices for left and right index based on the value of `keep`.""" + if keep == "all": + return left, right + grouped = pd.Series(right).groupby(left, sort=False) + if keep == "first": + grouped = grouped.min() + return grouped.index, grouped._values + grouped = grouped.max() + return grouped.index, grouped._values diff --git a/janitor/functions/_conditional_join/_less_than_indices.py b/janitor/functions/_conditional_join/_less_than_indices.py new file mode 100644 index 000000000..b96d6f1fb --- /dev/null +++ b/janitor/functions/_conditional_join/_less_than_indices.py @@ -0,0 +1,150 @@ +# helper functions for tuple | None: + """ + Use binary search to get indices where left + is less than or equal to right. + + If strict is True, then only indices + where `left` is less than + (but not equal to) `right` are returned. + + Returns the left index and the binary search positions for left in right. + """ + search_indices = right.searchsorted(left, side="left") + # if any of the positions in `search_indices` + # is equal to the length of `right_keys` + # that means the respective position in `left` + # has no values from `right` that are less than + # or equal, and should therefore be discarded + len_right = right.size + booleans = search_indices < len_right + if not booleans.any(): + return None + if not booleans.all(): + left = left[booleans] + left_index = left_index[booleans] + search_indices = search_indices[booleans] + # the idea here is that if there are any equal values + # shift to the right to the immediate next position + # that is not equal + if strict: + booleans = left == right[search_indices] + # replace positions where rows are equal + # with positions from searchsorted('right') + # positions from searchsorted('right') will never + # be equal and will be the furthermost in terms of position + # example : right -> [2, 2, 2, 3], and we need + # positions where values are not equal for 2; + # the furthermost will be 3, and searchsorted('right') + # will return position 3. + if booleans.any(): + replacements = right.searchsorted(left, side="right") + # now we can safely replace values + # with strictly less than positions + search_indices = np.where(booleans, replacements, search_indices) + # check again if any of the values + # have become equal to length of right + # and get rid of them + booleans = search_indices < len_right + if not booleans.any(): + return None + if not booleans.all(): + left_index = left_index[booleans] + search_indices = search_indices[booleans] + return left_index, search_indices + + +def _less_than_indices( + left: pd.Series, + right: pd.Series, + strict: bool, + keep: str, + return_matching_indices: bool, +) -> dict | None: + """ + Use binary search to get indices where left + is less than or equal to right. + + If strict is True, then only indices + where `left` is less than + (but not equal to) `right` are returned. + """ + # no point going through all the hassle + if left.min() > right.max(): + return { + "left_index": np.array([], dtype=np.intp), + "right_index": np.array([], dtype=np.intp), + } + outcome = _null_checks_cond_join(series=left) + if not outcome: + return { + "left_index": np.array([], dtype=np.intp), + "right_index": np.array([], dtype=np.intp), + } + left, _ = outcome + outcome = _null_checks_cond_join(series=right) + if not outcome: + return { + "left_index": np.array([], dtype=np.intp), + "right_index": np.array([], dtype=np.intp), + } + right, any_nulls = outcome + right, right_is_sorted = _sort_if_not_monotonic(series=right) + outcome = _le_lt_indices( + left=left.array, + right=right.array, + left_index=left.index._values, + strict=strict, + ) + if not outcome: + return { + "left_index": np.array([], dtype=np.intp), + "right_index": np.array([], dtype=np.intp), + } + left_index, search_indices = outcome + len_right = right.size + right_index = right.index._values + if right_is_sorted & (keep == "last"): + indexer = np.empty_like(search_indices) + indexer[:] = len_right - 1 + return {"left_index": left_index, "right_index": right_index[indexer]} + if right_is_sorted & (keep == "first") & any_nulls: + return { + "left_index": left_index, + "right_index": right_index[search_indices], + } + if right_is_sorted & (keep == "first"): + return {"left_index": left_index, "right_index": search_indices} + if keep == "first": + right = [right_index[ind:len_right] for ind in search_indices] + right = [arr.min() for arr in right] + return {"left_index": left_index, "right_index": right} + if keep == "last": + right = [right_index[ind:len_right] for ind in search_indices] + right = [arr.max() for arr in right] + return {"left_index": left_index, "right_index": right} + if return_matching_indices: + return dict( + left_index=left_index, + right_index=right_index, + starts=search_indices, + ends=np.repeat(len_right, search_indices.size), + ) + right = [right_index[ind:len_right] for ind in search_indices] + right = np.concatenate(right) + left = left_index.repeat(len_right - search_indices) + return {"left_index": left, "right_index": right} diff --git a/janitor/functions/_conditional_join/_not_equal_indices.py b/janitor/functions/_conditional_join/_not_equal_indices.py new file mode 100644 index 000000000..31b959641 --- /dev/null +++ b/janitor/functions/_conditional_join/_not_equal_indices.py @@ -0,0 +1,115 @@ +# helper functions for != +import numpy as np +import pandas as pd + +from janitor.functions._conditional_join._greater_than_indices import ( + _ge_gt_indices, +) +from janitor.functions._conditional_join._helpers import ( + _keep_output, + _null_checks_cond_join, + _sort_if_not_monotonic, +) +from janitor.functions._conditional_join._less_than_indices import ( + _le_lt_indices, +) + + +def _not_equal_indices( + left: pd.Series, right: pd.Series, keep: str +) -> dict | None: + """ + Use binary search to get indices where + `left` is exactly not equal to `right`. + + It is a combination of strictly less than + and strictly greater than indices. + """ + + dummy = np.array([], dtype=int) + + # deal with nulls + l1_nulls = dummy + r1_nulls = dummy + l2_nulls = dummy + r2_nulls = dummy + lt_left = [dummy] + lt_right = [dummy] + gt_left = [dummy] + gt_right = [dummy] + any_left_nulls = left.isna() + any_right_nulls = right.isna() + if any_left_nulls.any(): + l1_nulls = left.index[any_left_nulls.array] + l1_nulls = l1_nulls.to_numpy(copy=False) + r1_nulls = right.index + # avoid NAN duplicates + if any_right_nulls.any(): + r1_nulls = r1_nulls[~any_right_nulls.array] + r1_nulls = r1_nulls.to_numpy(copy=False) + nulls_count = l1_nulls.size + # blow up nulls to match length of right + l1_nulls = np.tile(l1_nulls, r1_nulls.size) + # ensure length of right matches left + if nulls_count > 1: + r1_nulls = np.repeat(r1_nulls, nulls_count) + if any_right_nulls.any(): + r2_nulls = right.index[any_right_nulls.array] + r2_nulls = r2_nulls.to_numpy(copy=False) + l2_nulls = left.index + right = right[~any_right_nulls] + nulls_count = r2_nulls.size + # blow up nulls to match length of left + r2_nulls = np.tile(r2_nulls, l2_nulls.size) + # ensure length of left matches right + if nulls_count > 1: + l2_nulls = np.repeat(l2_nulls, nulls_count) + + l1_nulls = [l1_nulls, l2_nulls] + r1_nulls = [r1_nulls, r2_nulls] + check1 = _null_checks_cond_join(series=left) + check2 = _null_checks_cond_join(series=right) + if (check1 is None) or (check2 is None): + lt_left = [dummy] + lt_right = [dummy] + else: + left, _ = check1 + right, _ = check2 + right, _ = _sort_if_not_monotonic(series=right) + right_index = right.index._values + outcome = _le_lt_indices( + left=left.array, + left_index=left.index._values, + right=right.array, + strict=True, + ) + if outcome is not None: + len_right = right.size + lt_left, search_indices = outcome + lt_right = [right_index[ind:len_right] for ind in search_indices] + lt_left = [lt_left.repeat(len_right - search_indices)] + outcome = _ge_gt_indices( + left=left.array, + right=right.array, + left_index=left.index._values, + strict=True, + ) + if outcome is not None: + gt_left, search_indices = outcome + gt_right = [right_index[:ind] for ind in search_indices] + gt_left = [gt_left.repeat(search_indices)] + lt_left.extend(gt_left) + lt_left.extend(l1_nulls) + lt_right.extend(gt_right) + lt_right.extend(r1_nulls) + left = np.concatenate(lt_left) + right = np.concatenate(lt_right) + if (not left.size) & (not right.size): + return { + "left_index": np.array([], dtype=np.intp), + "right_index": np.array([], dtype=np.intp), + } + outcome = _keep_output(keep, left, right) + outcome = zip(["left_index", "right_index"], outcome) + outcome = dict(outcome) + return outcome diff --git a/janitor/functions/_conditional_join/_single_join.py b/janitor/functions/_conditional_join/_single_join.py new file mode 100644 index 000000000..d27c1e0b8 --- /dev/null +++ b/janitor/functions/_conditional_join/_single_join.py @@ -0,0 +1,52 @@ +# helper function for a single join + +import pandas as pd + +from janitor.functions._conditional_join._greater_than_indices import ( + _greater_than_indices, +) +from janitor.functions._conditional_join._helpers import ( + greater_than_join_types, + less_than_join_types, +) +from janitor.functions._conditional_join._less_than_indices import ( + _less_than_indices, +) +from janitor.functions._conditional_join._not_equal_indices import ( + _not_equal_indices, +) + + +def _single_join( + df: pd.DataFrame, + right: pd.DataFrame, + condition: tuple, + keep: str, + return_matching_indices: bool, +) -> dict | None: + """ + Compute indices for a single join + """ + left_on, right_on, op = condition + if op in less_than_join_types: + return _less_than_indices( + left=df[left_on], + right=right[right_on], + strict=op == "<", + keep=keep, + return_matching_indices=return_matching_indices, + ) + if op in greater_than_join_types: + return _greater_than_indices( + left=df[left_on], + right=right[right_on], + strict=op == ">", + keep=keep, + return_matching_indices=return_matching_indices, + ) + if op == "!=": + return _not_equal_indices( + left=df[left_on], + right=right[right_on], + keep=keep, + ) diff --git a/janitor/functions/conditional_join.py b/janitor/functions/conditional_join.py index 4dc9d863f..24e253cc6 100644 --- a/janitor/functions/conditional_join.py +++ b/janitor/functions/conditional_join.py @@ -25,7 +25,9 @@ greater_than_join_types, less_than_join_types, ) -from janitor.utils import check, check_column +from janitor.utils import check, check_column, deprecated_kwargs + +from ._conditional_join import _single_join @pf.register_dataframe_method @@ -39,7 +41,6 @@ def conditional_join( keep: Literal["first", "last", "all"] = "all", use_numba: bool = False, indicator: Optional[Union[bool, str]] = False, - row_count: str = None, force: bool = False, ) -> pd.DataFrame: """The conditional_join function operates similarly to `pd.merge`, @@ -235,8 +236,6 @@ def conditional_join( - Added support for timedelta dtype. - 0.28.0 - `col` class deprecated. - - 0.32.0 - - Added `row_count` parameter. Args: df: A pandas DataFrame. @@ -267,8 +266,6 @@ def conditional_join( `right_only` for observations whose merge key only appears in the right DataFrame, and `both` if the observation’s merge key is found in both DataFrames. - row_count: If not None, adds a new column that captures the number of matching rows - from `right` for each row in `df`. force: If `True`, force the non-equi join conditions to execute before the equi join. @@ -287,7 +284,6 @@ def conditional_join( use_numba=use_numba, indicator=indicator, force=force, - row_count=row_count, ) @@ -318,8 +314,6 @@ def _conditional_join_preliminary_checks( indicator: Union[bool, str], force: bool, return_matching_indices: bool = False, - return_ragged_arrays: bool = False, - row_count: str = None, ) -> tuple: """ Preliminary checks for conditional_join are conducted here. @@ -403,19 +397,6 @@ def _conditional_join_preliminary_checks( check("force", force, [bool]) - check("return_ragged_arrays", return_ragged_arrays, [bool]) - - if row_count is not None: - check("row_count", row_count, [Hashable]) - if row_count in df.columns: - raise pd.errors.DuplicateLabelError( - f"{row_count} already exists as a column label in df." - ) - if keep != "all": - raise ValueError("row_count applies only when `keep=all`") - if how != "left": - raise ValueError("row_count applies only when `how=left`") - return ( df, right, @@ -427,8 +408,6 @@ def _conditional_join_preliminary_checks( use_numba, indicator, force, - return_ragged_arrays, - row_count, ) @@ -480,8 +459,6 @@ def _conditional_join_compute( indicator: Union[bool, str], force: bool, return_matching_indices: bool = False, - return_ragged_arrays: bool = False, - row_count: str = None, ) -> pd.DataFrame: """ This is where the actual computation @@ -499,8 +476,6 @@ def _conditional_join_compute( use_numba, indicator, force, - return_ragged_arrays, - row_count, ) = _conditional_join_preliminary_checks( df=df, right=right, @@ -513,8 +488,6 @@ def _conditional_join_compute( indicator=indicator, force=force, return_matching_indices=return_matching_indices, - return_ragged_arrays=return_ragged_arrays, - row_count=row_count, ) eq_check = False le_lt_check = False @@ -540,8 +513,8 @@ def _conditional_join_compute( keep=keep, use_numba=use_numba, force=force, - return_ragged_arrays=return_ragged_arrays, - row_count=row_count, + return_ragged_arrays=False, + row_count=False, ) elif (len(conditions) > 1) & le_lt_check: result = _multiple_conditional_join_le_lt( @@ -550,8 +523,8 @@ def _conditional_join_compute( conditions=conditions, keep=keep, use_numba=use_numba, - return_ragged_arrays=return_ragged_arrays, - row_count=row_count, + return_ragged_arrays=False, + row_count=False, ) elif len(conditions) > 1: result = _multiple_conditional_join_ne( @@ -559,42 +532,34 @@ def _conditional_join_compute( right=right, conditions=conditions, keep=keep, - row_count=row_count, - ) - elif use_numba: - result = _numba_single_non_equi_join( - left=df[left_on], - right=right[right_on], - op=op, - keep=keep, - row_count=row_count, + row_count=False, ) else: - result = _generic_func_cond_join( - left=df[left_on], - right=right[right_on], - op=op, - multiple_conditions=False, + # TODO: handle numba computations separately + result = _single_join._single_join( + df=df, + right=right, + condition=conditions[0], keep=keep, - return_ragged_arrays=return_ragged_arrays, - row_count=row_count, + return_matching_indices=return_matching_indices, ) - if row_count: - if (df_columns is not None) and (df_columns != slice(None)): - df = df.select(columns=df_columns) - df = df[:] - df[row_count] = 0 - if result is None: - return df - _, result = df[row_count].align(result, join="left", fill_value=0) - df[row_count] = result - return df - if result is None: - result = np.array([], dtype=np.intp), np.array([], dtype=np.intp) if return_matching_indices: return result - + # TODO: unify into single approach + if len(conditions) == 1: + return _create_frame( + df=df, + right=right, + left_index=result["left_index"], + right_index=result["right_index"], + how=how, + df_columns=df_columns, + right_columns=right_columns, + indicator=indicator, + ) + if result is None: + result = np.array([], dtype=np.intp), np.array([], dtype=np.intp) left_index, right_index = result return _create_frame( df=df, @@ -1532,6 +1497,7 @@ def _inner( return pd.DataFrame(dictionary, copy=False) +@deprecated_kwargs("return_ragged_arrays") def get_join_indices( df: pd.DataFrame, right: Union[pd.DataFrame, pd.Series], @@ -1539,8 +1505,7 @@ def get_join_indices( keep: Literal["first", "last", "all"] = "all", use_numba: bool = False, force: bool = False, - return_ragged_arrays: bool = False, -) -> tuple[np.ndarray, np.ndarray]: +) -> dict: """Convenience function to return the matching indices from an inner join. !!! info "New in version 0.27.0" @@ -1549,6 +1514,9 @@ def get_join_indices( - 0.29.0 - Add support for ragged array indices. + - 0.32.0 + - ragged array indices deprecated. + - return indices as a dictionary Args: df: A pandas DataFrame. @@ -1557,7 +1525,7 @@ def get_join_indices( `(left_on, right_on, op)`, where `left_on` is the column label from `df`, `right_on` is the column label from `right`, while `op` is the operator. - The `col` class is also supported. The operator can be any of + The operator can be any of `==`, `!=`, `<=`, `<`, `>=`, `>`. For multiple conditions, the and(`&`) operator is used to combine the results of the individual conditions. @@ -1565,14 +1533,9 @@ def get_join_indices( keep: Choose whether to return the first match, last match or all matches. force: If `True`, force the non-equi join conditions to execute before the equi join. - return_ragged_arrays: If `True`, return slices/ranges of matching right indices - for each matching left index. Not applicable if `use_numba` is `True`. - If `return_ragged_arrays` is `True`, the join condition - should be a single join, or a range join, - where the right columns are both monotonically increasing. Returns: - A tuple of indices for the rows in the dataframes that match. + A dictionary of indices for the rows in the dataframes that match. """ return _conditional_join_compute( df=df, @@ -1586,7 +1549,6 @@ def get_join_indices( indicator=False, force=force, return_matching_indices=True, - return_ragged_arrays=return_ragged_arrays, ) diff --git a/tests/functions/test_conditional_join.py b/tests/functions/test_conditional_join.py index 168ea3e77..c1c3bbae2 100644 --- a/tests/functions/test_conditional_join.py +++ b/tests/functions/test_conditional_join.py @@ -3,9 +3,9 @@ import pytest from hypothesis import given, settings from pandas import Timedelta -from pandas.testing import assert_frame_equal, assert_index_equal +from pandas.testing import assert_frame_equal -from janitor import get_join_indices +# from janitor import get_join_indices from janitor.testing_utils.strategies import ( conditional_df, conditional_right, @@ -63,44 +63,6 @@ def test_df_columns_right_columns_both_None(dummy, series): ) -def test_type_row_count(dummy, series): - """Test type for row_count""" - with pytest.raises(TypeError, match="row_count should be one of.+"): - dummy.conditional_join(series, ("id", "B", ">"), row_count={"a": 2}) - - -def test_duplicated_row_count(dummy, series): - """raise if row_count is duplicated""" - with pytest.raises( - pd.errors.DuplicateLabelError, match="id already exists as a column.+" - ): - dummy.conditional_join(series, ("id", "B", ">"), row_count="id") - - -def test_how_row_count(dummy, series): - """raise if how != left""" - with pytest.raises( - ValueError, match="row_count applies only when `how=left`" - ): - dummy.conditional_join( - series, ("id", "B", ">"), row_count="row_count", how="inner" - ) - - -def test_keep_row_count(dummy, series): - """raise if keep != all""" - with pytest.raises( - ValueError, match="row_count applies only when `keep=all`" - ): - dummy.conditional_join( - series, - ("id", "B", ">"), - row_count="row_count", - how="left", - keep="first", - ) - - def test_df_multiindex(dummy, series): """Raise ValueError if `df` columns is a MultiIndex.""" with pytest.raises( @@ -243,18 +205,6 @@ def test_check_force_type(dummy, series): dummy.conditional_join(series, ("id", "B", "<"), force=1) -def test_check_return_ragged_arrays_type(dummy, series): - """ - Raise TypeError if `return_ragged_arrays` is not boolean. - """ - with pytest.raises( - TypeError, match="return_ragged_arrays should be one of.+" - ): - get_join_indices( - dummy, series, [("id", "B", "<")], return_ragged_arrays=1 - ) - - def test_check_how_value(dummy, series): """ Raise ValueError if `how` is not one of @@ -600,150 +550,6 @@ def test_single_condition_less_than_floats(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_lt_floats_row_count(df, right): - """Test output for a single condition. "<".""" - df = df.loc[:, ["B"]].assign(index=df.index) - right = right.loc[:, ["Numeric"]] - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.B.lt(df.Numeric)] - .groupby(["B", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["B", "index"], ignore_index=True) - .drop(columns="index") - ) - actual = ( - df.conditional_join( - right, - ("B", "Numeric", "<"), - how="left", - row_count="row_count", - df_columns="B", - ) - .astype({"row_count": int}) - .sort_values(["B"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_numba_lt_floats_row_count(df, right): - """Test output for a single condition. "<".""" - df = df.loc[:, ["B"]].assign(index=df.index) - right = right.loc[:, ["Numeric"]] - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.B.lt(df.Numeric)] - .groupby(["B", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["B", "index"], ignore_index=True) - .drop(columns="index") - ) - actual = ( - df.conditional_join( - right, - ("B", "Numeric", "<"), - how="left", - row_count="row_count", - df_columns="B", - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["B"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_le_floats_row_count(df, right): - """Test output for a single condition. "<=".""" - - df = df.loc[:, ["B"]].assign(index=df.index) - right = right.loc[:, ["Numeric"]] - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.B.le(df.Numeric)] - .groupby(["B", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["B", "index"], ignore_index=True) - .drop(columns="index") - ) - actual = ( - df.conditional_join( - right, - ("B", "Numeric", "<="), - how="left", - row_count="row_count", - df_columns="B", - ) - .astype({"row_count": int}) - .sort_values(["B"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_numba_le_floats_row_count(df, right): - """Test output for a single condition. "<=".""" - - df = df.loc[:, ["B"]].assign(index=df.index) - right = right.loc[:, ["Numeric"]] - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.B.le(df.Numeric)] - .groupby(["B", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["B", "index"], ignore_index=True) - .drop(columns="index") - ) - actual = ( - df.conditional_join( - right, - ("B", "Numeric", "<="), - how="left", - row_count="row_count", - df_columns="B", - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["B"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -912,105 +718,6 @@ def test_single_condition_less_than_ints(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_lt_ints_row_count(df, right): - """Test output for a single condition. "<".""" - - df = df.loc[:, ["A"]].assign(index=df.index) - right = right.loc[:, ["Integers"]] - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.lt(df.Integers)] - .groupby(["A", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "index"], ignore_index=True) - .drop(columns="index") - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "<"), - how="left", - row_count="row_count", - df_columns="A", - ) - .astype({"row_count": int}) - .sort_values(["A"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_le_ints_row_count(df, right): - """Test output for a single condition. "<=".""" - - df = df.loc[:, ["A"]].assign(index=df.index) - right = right.loc[:, ["Integers"]] - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.le(df.Integers)] - .groupby(["A", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "index"], ignore_index=True) - .drop(columns="index") - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "<="), - how="left", - row_count="row_count", - df_columns="A", - ) - .astype({"row_count": int}) - .sort_values(["A"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_less_than_ints_numba(df, right): - """Test output for a single condition. "<".""" - - expected = ( - df[["A"]] - .merge(right[["Integers"]], how="cross") - .loc[lambda df: df.A.lt(df.Integers)] - .sort_values(["A", "Integers"], ignore_index=True) - ) - - actual = ( - df[["A"]] - .conditional_join( - right[["Integers"]], - ("A", "Integers", "<"), - how="inner", - use_numba=True, - ) - .sort_values(["A", "Integers"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -1239,78 +946,6 @@ def test_single_condition_greater_than_datetime_numba(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_ge_ints_row_count(df, right): - """Test output for a single condition. ">=".""" - - df = df.loc[:, ["A"]].assign(index=df.index) - right = right.loc[:, ["Integers"]] - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.ge(df.Integers)] - .groupby(["A", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "index"], ignore_index=True) - .drop(columns="index") - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">="), - how="left", - row_count="row_count", - df_columns="A", - ) - .astype({"row_count": int}) - .sort_values(["A"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_gt_ints_row_count(df, right): - """Test output for a single condition. ">".""" - - df = df.loc[:, ["A"]].assign(index=df.index) - right = right.loc[:, ["Integers"]] - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.gt(df.Integers)] - .groupby(["A", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "index"], ignore_index=True) - .drop(columns="index") - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">"), - how="left", - row_count="row_count", - df_columns="A", - ) - .astype({"row_count": int}) - .sort_values(["A"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -1559,13 +1194,18 @@ def test_single_condition_not_equal_floats_only(df, right): .tail(1) .drop(columns="index") .reset_index(drop=True) + .sort_values(["B", "Numeric"], ignore_index=True) ) - actual = df[["B"]].conditional_join( - right[["Numeric"]], - ("B", "Numeric", "!="), - how="inner", - keep="last", + actual = ( + df[["B"]] + .conditional_join( + right[["Numeric"]], + ("B", "Numeric", "!="), + how="inner", + keep="last", + ) + .sort_values(["B", "Numeric"], ignore_index=True) ) assert_frame_equal(expected, actual) @@ -1607,61 +1247,30 @@ def test_single_condition_not_equal_floats_only_numba(df, right): @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) -def test_single_condition_ne_dates_row_count(df, right): +def test_single_condition_not_equal_datetime(df, right): """Test output for a single condition. "!=".""" - df = df.loc[:, ["E"]].assign(index=df.index) - right = right.loc[:, ["Dates"]] - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.E.ne(df.Dates)] - .groupby(["E", "index"], dropna=False) - .size() - .rename("counter") - ) expected = ( - df.merge(expected, how="left", on=["E", "index"]) - .assign(counter=lambda df: df.counter.fillna(0).astype(int)) - .sort_values(["E", "index"], ignore_index=True) + df[["E"]] + .assign(index=df.index) + .merge(right[["Dates"]], how="cross") + .loc[lambda df: df.E != df.Dates] + .groupby("index") + .head(1) .drop(columns="index") + .reset_index(drop=True) + .sort_values(["E", "Dates"], ignore_index=True) ) + actual = ( - df.conditional_join( - right, + df[["E"]] + .conditional_join( + right[["Dates"]], ("E", "Dates", "!="), - how="left", - row_count="counter", - df_columns="E", + how="inner", + keep="first", ) - .astype({"counter": int}) - .sort_values(["E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_single_condition_not_equal_datetime(df, right): - """Test output for a single condition. "!=".""" - - expected = ( - df[["E"]] - .assign(index=df.index) - .merge(right[["Dates"]], how="cross") - .loc[lambda df: df.E != df.Dates] - .groupby("index") - .head(1) - .drop(columns="index") - .reset_index(drop=True) - ) - - actual = df[["E"]].conditional_join( - right[["Dates"]], - ("E", "Dates", "!="), - how="inner", - keep="first", + .sort_values(["E", "Dates"], ignore_index=True) ) assert_frame_equal(expected, actual) @@ -1935,85 +1544,6 @@ def test_dual_conditions_gt_and_lt_dates(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_dual_conditions_gt_and_lt_dates_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.between( - df.Dates, df.Dates_Right, inclusive="neither" - ) - ] - .groupby(["E", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["E", "index"], ignore_index=True) - .loc[:, ["E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">"), - ("E", "Dates_Right", "<"), - how="left", - row_count="row_count", - df_columns="E", - ) - .astype({"row_count": int}) - .sort_values(["E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_dual_conditions_numba_gt_and_lt_dates_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.between( - df.Dates, df.Dates_Right, inclusive="neither" - ) - ] - .groupby(["E", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["E", "index"], ignore_index=True) - .loc[:, ["E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">"), - ("E", "Dates_Right", "<"), - how="left", - row_count="row_count", - df_columns="E", - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -2076,81 +1606,6 @@ def test_dual_conditions_ge_and_le_dates(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_dual_conditions_ge_and_le_dates_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.between(df.Dates, df.Dates_Right, inclusive="both") - ] - .groupby(["E", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["E", "index"], ignore_index=True) - .loc[:, ["E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">="), - ("E", "Dates_Right", "<="), - how="left", - row_count="row_count", - df_columns="E", - ) - .astype({"row_count": int}) - .sort_values(["E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_dual_conditions_ge_and_le_dates_numba_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.between(df.Dates, df.Dates_Right, inclusive="both") - ] - .groupby(["E", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["E", "index"], ignore_index=True) - .loc[:, ["E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">="), - ("E", "Dates_Right", "<="), - how="left", - row_count="row_count", - df_columns="E", - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @pytest.mark.turtle @@ -2269,85 +1724,6 @@ def test_dual_conditions_ge_and_le_dates_right_open(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_dual_conditions_gt_and_le_dates_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.between( - df.Dates, df.Dates_Right, inclusive="right" - ) - ] - .groupby(["E", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["E", "index"], ignore_index=True) - .loc[:, ["E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">"), - ("E", "Dates_Right", "<="), - how="left", - row_count="row_count", - df_columns="E", - ) - .astype({"row_count": int}) - .sort_values(["E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_dual_conditions_gt_and_le_dates_numba_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.between( - df.Dates, df.Dates_Right, inclusive="right" - ) - ] - .groupby(["E", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["E", "index"], ignore_index=True) - .loc[:, ["E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">"), - ("E", "Dates_Right", "<="), - how="left", - row_count="row_count", - df_columns="E", - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -2554,81 +1930,6 @@ def test_dual_conditions_gt_and_lt_numbers_left_open(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_dual_conditions_ge_and_lt_dates_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.between(df.Dates, df.Dates_Right, inclusive="left") - ] - .groupby(["E", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["E", "index"], ignore_index=True) - .loc[:, ["E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">="), - ("E", "Dates_Right", "<"), - how="left", - row_count="row_count", - df_columns="E", - ) - .astype({"row_count": int}) - .sort_values(["E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_dual_conditions_ge_and_lt_dates_numba_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.between(df.Dates, df.Dates_Right, inclusive="left") - ] - .groupby(["E", "index"]) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["E", "index"], ignore_index=True) - .loc[:, ["E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">="), - ("E", "Dates_Right", "<"), - how="left", - row_count="row_count", - df_columns="E", - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -2821,41 +2122,6 @@ def test_dual_ne_extension(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_dual_conditions_ne_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.ne(df.Integers) & df.B.ne(df.Numeric),] - .groupby(["A", "B", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "index"], ignore_index=True) - .loc[:, ["A", "B", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "!="), - ("B", "Numeric", "!="), - how="left", - row_count="row_count", - df_columns=["A", "B"], - ) - .astype({"row_count": int}) - .sort_values(["A", "B"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -3020,46 +2286,6 @@ def test_multiple_ne_dates(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_ne_row_count(df, right): - """Test output for interval conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ne(df.Integers) - & df.E.ne(df.Dates) - & df.B.ne(df.Numeric) - ] - .groupby(["A", "E", "B", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "E", "B", "index"], ignore_index=True) - .loc[:, ["A", "E", "B", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "!="), - ("E", "Dates", "!="), - ("B", "Numeric", "!="), - how="left", - row_count="row_count", - df_columns=["A", "E", "B"], - ) - .astype({"row_count": int}) - .sort_values(["A", "E", "B"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -3225,203 +2451,6 @@ def test_conditions_eq_and_gt_ne_numba(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_eq_lt_ne_row_count(df, right): - """Test output for equal and not equal conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.ne(df.Dates) - & df.B.eq(df.Numeric) - & df.A.lt(df.Integers) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("B", "Numeric", "=="), - ("E", "Dates", "!="), - ("A", "Integers", "<"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_eq_lt_ne_numba_row_count(df, right): - """Test output for equal and not equal conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.ne(df.Dates) - & df.B.eq(df.Numeric) - & df.A.lt(df.Integers) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("B", "Numeric", "=="), - ("E", "Dates", "!="), - ("A", "Integers", "<"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_eq_ge_ne_row_count(df, right): - """Test output for equal and not equal conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.ne(df.Dates) - & df.B.eq(df.Numeric) - & df.A.ge(df.Integers) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("B", "Numeric", "=="), - ("E", "Dates", "!="), - ("A", "Integers", ">="), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_eq_ge_ne_numba_row_count(df, right): - """Test output for equal and not equal conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.E.ne(df.Dates) - & df.B.eq(df.Numeric) - & df.A.ge(df.Integers) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("B", "Numeric", "=="), - ("E", "Dates", "!="), - ("A", "Integers", ">="), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_eq_ne_row_count(df, right): - """Test output for equal and not equal conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.E.ne(df.Dates) & df.B.eq(df.Numeric)] - .groupby(["B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["B", "E", "index"], ignore_index=True) - .loc[:, ["B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("B", "Numeric", "=="), - ("E", "Dates", "!="), - how="left", - row_count="row_count", - df_columns=["B", "E"], - ) - .astype({"row_count": int}) - .sort_values(["B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -3490,87 +2519,6 @@ def test_gt_lt_ne_conditions(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_gt_lt_ne_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.gt(df.Integers) - & df.E.ne(df.Dates) - & df.B.lt(df.Numeric) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">"), - ("B", "Numeric", "<"), - ("E", "Dates", "!="), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_gt_lt_ne_numba_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.gt(df.Integers) - & df.E.ne(df.Dates) - & df.B.lt(df.Numeric) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">"), - ("B", "Numeric", "<"), - ("E", "Dates", "!="), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @pytest.mark.turtle @@ -3701,196 +2649,54 @@ def test_le_ne_conditions(df, right): @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) -def test_le_ne_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) +def test_le_ne_numba_conditions(df, right): + """ + Test output for multiple conditions. + """ + + filters = ["A", "E", "Integers", "Dates"] expected = ( - df.merge(right, how="cross") + df[["A", "E"]] + .merge(right[["Integers", "Dates"]], how="cross") .loc[lambda df: df.A.le(df.Integers) & df.E.ne(df.Dates)] - .groupby(["A", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "E", "index"], ignore_index=True) - .loc[:, ["A", "E", "row_count"]] + .sort_values(filters, ignore_index=True) ) + actual = ( - df.conditional_join( - right, + df[["A", "E"]] + .conditional_join( + right[["Integers", "Dates"]], ("A", "Integers", "<="), ("E", "Dates", "!="), - how="left", - row_count="row_count", - df_columns=["A", "E"], + how="inner", + use_numba=True, ) - .astype({"row_count": int}) - .sort_values(["A", "E"], ignore_index=True) + .sort_values(filters, ignore_index=True) ) assert_frame_equal(expected, actual) -@pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) -def test_numba_le_ne_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.le(df.Integers) & df.E.ne(df.Dates)] - .groupby(["A", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) +@pytest.mark.turtle +def test_gt_lt_ne_start(df, right): + """ + Test output for multiple conditions. + """ + + filters = ["A", "B", "E", "Integers", "Numeric", "Dates"] expected = ( - df.merge(expected, how="left", on=["A", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "E", "index"], ignore_index=True) - .loc[:, ["A", "E", "row_count"]] + df[["A", "B", "E"]] + .merge(right[["Integers", "Numeric", "Dates"]], how="cross") + .loc[ + lambda df: df.A.gt(df.Integers) + & df.B.lt(df.Numeric) + & df.E.ne(df.Dates) + ] + .sort_values(filters, ignore_index=True) ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "<="), - ("E", "Dates", "!="), - how="left", - row_count="row_count", - df_columns=["A", "E"], - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["A", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_gt_ne_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.gt(df.Integers) & df.E.ne(df.Dates)] - .groupby(["A", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "E", "index"], ignore_index=True) - .loc[:, ["A", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">"), - ("E", "Dates", "!="), - how="left", - row_count="row_count", - df_columns=["A", "E"], - ) - .astype({"row_count": int}) - .sort_values(["A", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_numba_gt_ne_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.gt(df.Integers) & df.E.ne(df.Dates)] - .groupby(["A", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "E", "index"], ignore_index=True) - .loc[:, ["A", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">"), - ("E", "Dates", "!="), - how="left", - row_count="row_count", - df_columns=["A", "E"], - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["A", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_le_ne_numba_conditions(df, right): - """ - Test output for multiple conditions. - """ - - filters = ["A", "E", "Integers", "Dates"] - expected = ( - df[["A", "E"]] - .merge(right[["Integers", "Dates"]], how="cross") - .loc[lambda df: df.A.le(df.Integers) & df.E.ne(df.Dates)] - .sort_values(filters, ignore_index=True) - ) - - actual = ( - df[["A", "E"]] - .conditional_join( - right[["Integers", "Dates"]], - ("A", "Integers", "<="), - ("E", "Dates", "!="), - how="inner", - use_numba=True, - ) - .sort_values(filters, ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_gt_lt_ne_start(df, right): - """ - Test output for multiple conditions. - """ - - filters = ["A", "B", "E", "Integers", "Numeric", "Dates"] - expected = ( - df[["A", "B", "E"]] - .merge(right[["Integers", "Numeric", "Dates"]], how="cross") - .loc[ - lambda df: df.A.gt(df.Integers) - & df.B.lt(df.Numeric) - & df.E.ne(df.Dates) - ] - .sort_values(filters, ignore_index=True) - ) - + actual = ( df[["A", "B", "E"]] .conditional_join( @@ -3942,87 +2748,6 @@ def test_ge_le_ne_extension_array(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_ge_lt_ne_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ne(df.Integers) - & df.E.ge(df.Dates) - & df.B.lt(df.Numeric) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">="), - ("A", "Integers", "!="), - ("B", "Numeric", "<"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_ge_lt_ne_numba_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ne(df.Integers) - & df.E.ge(df.Dates) - & df.B.lt(df.Numeric) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("E", "Dates", ">="), - ("A", "Integers", "!="), - ("B", "Numeric", "<"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @pytest.mark.turtle @@ -4372,92 +3097,6 @@ def test_multiple_ge_eq_and_le_numbers(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_ge_eq_and_le_numbers_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ge(df.Integers) - & df.E.le(df.Dates) - & df.B.gt(df.Numeric) - & df.B.eq(df.Floats), - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">="), - ("E", "Dates", "<="), - ("B", "Floats", "=="), - ("B", "Numeric", ">"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=False, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_ge_eq_and_le_numbers_numba_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ge(df.Integers) - & df.E.le(df.Dates) - & df.B.gt(df.Numeric) - & df.B.eq(df.Floats), - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">="), - ("E", "Dates", "<="), - ("B", "Floats", "=="), - ("B", "Numeric", ">"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -5053,624 +3692,152 @@ def test_ge_eq_and_le_datess_numba(df, right): assert_frame_equal(expected, actual) +# @settings(deadline=None, max_examples=10) +# @given(df=conditional_df(), right=conditional_right()) +# @pytest.mark.turtle +# def test_ge_eq_and_le_datess_numba_indices(df, right): +# """compare join indices for multiple conditions.""" + +# expected = ( +# df.reset_index() +# .dropna(subset=["E"]) +# .merge( +# right.dropna(subset=["Dates"]), +# left_on="E", +# right_on="Dates", +# how="inner", +# sort=False, +# ) +# .loc[ +# lambda df: df.B.gt(df.Floats) +# & df.A.lt(df.Integers) +# & df.B.ne(df.Numeric), +# "index", +# ] +# ) +# expected = pd.Index(expected) + +# actual, _ = get_join_indices( +# df[["B", "A", "E"]], +# right[["Floats", "Integers", "Dates", "Numeric"]], +# [ +# ("A", "Integers", "<"), +# ("E", "Dates", "=="), +# ("B", "Floats", ">"), +# ("B", "Numeric", "!="), +# ], +# use_numba=True, +# ) +# actual = df.index[actual] +# assert_index_equal(expected, actual, check_names=False) + + +# @settings(deadline=None, max_examples=10) +# @given(df=conditional_df(), right=conditional_right()) +# @pytest.mark.turtle +# def test_eq_indices(df, right): +# """compare join indices for single condition.""" + +# expected = ( +# df.reset_index() +# .dropna(subset=["E"]) +# .merge( +# right.dropna(subset=["Dates"]), +# left_on="E", +# right_on="Dates", +# how="inner", +# sort=False, +# ) +# .loc[:, "index"] +# ) +# expected = pd.Index(expected) + +# actual, _ = get_join_indices( +# df, +# right, +# [ +# ("E", "Dates", "=="), +# ], +# ) +# actual = df.index[actual] +# assert_index_equal(expected, actual, check_names=False) + + +# @settings(deadline=None, max_examples=10) +# @given(df=conditional_df(), right=conditional_right()) +# @pytest.mark.turtle +# def test_ge_eq_and_le_datess_indices(df, right): +# """compare join indices for multiple conditions.""" +# expected = ( +# df.dropna(subset="E") +# .reset_index() +# .merge( +# right.dropna(subset="Dates"), +# left_on="E", +# right_on="Dates", +# how="inner", +# sort=False, +# ) +# .loc[ +# lambda df: df.B.gt(df.Floats) +# & df.A.lt(df.Integers) +# & df.B.ne(df.Numeric), +# "index", +# ] +# ) +# expected = pd.Index(expected) + +# actual, _ = get_join_indices( +# df[["B", "A", "E"]], +# right[["Floats", "Integers", "Dates", "Numeric"]], +# [ +# ("A", "Integers", "<"), +# ("E", "Dates", "=="), +# ("B", "Floats", ">"), +# ("B", "Numeric", "!="), +# ], +# ) +# actual = df.index[actual] +# assert_index_equal(expected, actual, check_names=False) + + +@pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_ge_eq_and_le_datess_numba_indices(df, right): - """compare join indices for multiple conditions.""" +def test_multiple_non_equi(df, right): + """Test output for multiple conditions.""" + columns = ["B", "A", "E", "Floats", "Integers", "Dates"] expected = ( - df.reset_index() - .dropna(subset=["E"]) - .merge( - right.dropna(subset=["Dates"]), - left_on="E", - right_on="Dates", - how="inner", - sort=False, + df.merge( + right, + how="cross", ) .loc[ - lambda df: df.B.gt(df.Floats) - & df.A.lt(df.Integers) - & df.B.ne(df.Numeric), - "index", + lambda df: df.A.ge(df.Integers) + & df.E.le(df.Dates) + & df.B.lt(df.Floats), + columns, ] + .sort_values(columns, ignore_index=True) ) - expected = pd.Index(expected) - actual, _ = get_join_indices( - df[["B", "A", "E"]], - right[["Floats", "Integers", "Dates", "Numeric"]], - [ - ("A", "Integers", "<"), - ("E", "Dates", "=="), - ("B", "Floats", ">"), - ("B", "Numeric", "!="), - ], - use_numba=True, + actual = ( + df[["B", "A", "E"]] + .conditional_join( + right[["Floats", "Integers", "Dates"]], + ("A", "Integers", ">="), + ("E", "Dates", "<="), + ("B", "Floats", "<"), + how="inner", + ) + .sort_values(columns, ignore_index=True) ) - actual = df.index[actual] - assert_index_equal(expected, actual, check_names=False) + + assert_frame_equal(expected, actual) -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_eq_indices(df, right): - """compare join indices for single condition.""" - - expected = ( - df.reset_index() - .dropna(subset=["E"]) - .merge( - right.dropna(subset=["Dates"]), - left_on="E", - right_on="Dates", - how="inner", - sort=False, - ) - .loc[:, "index"] - ) - expected = pd.Index(expected) - - actual, _ = get_join_indices( - df, - right, - [ - ("E", "Dates", "=="), - ], - ) - actual = df.index[actual] - assert_index_equal(expected, actual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_eq_indices_ragged_arrays(df, right): - """compare join indices for single condition.""" - - expected = ( - df.assign(lindex=range(len(df))) - .dropna(subset=["E"]) - .merge( - right.assign(rindex=range(len(right))).dropna(subset=["Dates"]), - left_on="E", - right_on="Dates", - how="inner", - sort=False, - ) - .loc[:, ["lindex", "rindex"]] - .sort_values(["lindex", "rindex"]) - ) - rindex = pd.Index(expected["rindex"]) - lindex = pd.Index(expected["lindex"]) - - lactual, ractual = get_join_indices( - df, - right, - [ - ("E", "Dates", "=="), - ], - return_ragged_arrays=True, - ) - if isinstance(ractual, (slice, list)): - ractual = [right.index[arr] for arr in ractual] - lengths = [len(arr) for arr in ractual] - ractual = np.concatenate(ractual) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual).repeat(lengths) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - sorter = np.lexsort((ractual, lactual)) - lactual = lactual[sorter] - ractual = ractual[sorter] - sorter = np.lexsort((rindex, lindex)) - lindex = lindex[sorter] - rindex = rindex[sorter] - assert_index_equal(rindex, ractual, check_names=False) - assert_index_equal(lindex, lactual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_le_indices_ragged_arrays(df, right): - """compare join indices for single condition.""" - expected = ( - df.assign(lindex=range(len(df))) - .merge( - right.assign(rindex=range(len(right))), - how="cross", - ) - .loc[lambda df: df.E.le(df.Dates), ["lindex", "rindex"]] - ) - rindex = pd.Index(expected["rindex"]) - lindex = pd.Index(expected["lindex"]) - - lactual, ractual = get_join_indices( - df, - right, - [ - ("E", "Dates", "<="), - ], - return_ragged_arrays=True, - ) - if isinstance(ractual, list): - ractual = [right.index[arr] for arr in ractual] - lengths = [len(arr) for arr in ractual] - ractual = np.concatenate(ractual) - lactual = pd.Index(lactual).repeat(lengths) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - sorter = np.lexsort((ractual, lactual)) - lactual = lactual[sorter] - ractual = ractual[sorter] - sorter = np.lexsort((rindex, lindex)) - lindex = lindex[sorter] - rindex = rindex[sorter] - assert_index_equal(rindex, ractual, check_names=False) - assert_index_equal(lindex, lactual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_lt_indices_ragged_arrays(df, right): - """compare join indices for single condition.""" - - expected = ( - df.assign(lindex=range(len(df))) - .merge( - right.assign(rindex=range(len(right))), - how="cross", - ) - .loc[lambda df: df.E.lt(df.Dates), ["lindex", "rindex"]] - ) - rindex = pd.Index(expected["rindex"]) - lindex = pd.Index(expected["lindex"]) - - lactual, ractual = get_join_indices( - df, - right, - [ - ("E", "Dates", "<"), - ], - return_ragged_arrays=True, - ) - if isinstance(ractual, list): - ractual = [right.index[arr] for arr in ractual] - lengths = [len(arr) for arr in ractual] - ractual = np.concatenate(ractual) - lactual = pd.Index(lactual).repeat(lengths) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - sorter = np.lexsort((ractual, lactual)) - lactual = lactual[sorter] - ractual = ractual[sorter] - sorter = np.lexsort((rindex, lindex)) - lindex = lindex[sorter] - rindex = rindex[sorter] - assert_index_equal(rindex, ractual, check_names=False) - assert_index_equal(lindex, lactual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_gt_indices_ragged_arrays(df, right): - """compare join indices for single condition.""" - - expected = ( - df.assign(lindex=range(len(df))) - .merge( - right.assign(rindex=range(len(right))), - how="cross", - ) - .loc[lambda df: df.E.gt(df.Dates), ["lindex", "rindex"]] - ) - rindex = pd.Index(expected["rindex"]) - lindex = pd.Index(expected["lindex"]) - - lactual, ractual = get_join_indices( - df, - right, - [ - ("E", "Dates", ">"), - ], - return_ragged_arrays=True, - ) - if isinstance(ractual, list): - ractual = [right.index[arr] for arr in ractual] - lengths = [len(arr) for arr in ractual] - ractual = np.concatenate(ractual) - lactual = pd.Index(lactual).repeat(lengths) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - sorter = np.lexsort((ractual, lactual)) - lactual = lactual[sorter] - ractual = ractual[sorter] - sorter = np.lexsort((rindex, lindex)) - lindex = lindex[sorter] - rindex = rindex[sorter] - assert_index_equal(rindex, ractual, check_names=False) - assert_index_equal(lindex, lactual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_ge_indices_ragged_arrays(df, right): - """compare join indices for single condition.""" - - expected = ( - df.assign(lindex=range(len(df))) - .dropna(subset=["E"]) - .merge( - right.assign(rindex=range(len(right))).dropna(subset=["Dates"]), - how="cross", - ) - .loc[lambda df: df.E.ge(df.Dates), ["lindex", "rindex"]] - ) - rindex = pd.Index(expected["rindex"]) - lindex = pd.Index(expected["lindex"]) - - lactual, ractual = get_join_indices( - df, - right, - [ - ("E", "Dates", ">="), - ], - return_ragged_arrays=True, - ) - if isinstance(ractual, list): - ractual = [right.index[arr] for arr in ractual] - lengths = [len(arr) for arr in ractual] - ractual = np.concatenate(ractual) - lactual = pd.Index(lactual).repeat(lengths) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - sorter = np.lexsort((ractual, lactual)) - lactual = lactual[sorter] - ractual = ractual[sorter] - sorter = np.lexsort((rindex, lindex)) - lindex = lindex[sorter] - rindex = rindex[sorter] - assert_index_equal(rindex, ractual, check_names=False) - assert_index_equal(lindex, lactual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_le_gt_indices_ragged_arrays(df, right): - """compare join indices for range join.""" - - expected = ( - df.assign(lindex=range(len(df))) - .merge( - right.assign(rindex=range(len(right))), - how="cross", - ) - .loc[ - lambda df: df.E.le(df.Dates) & df.B.gt(df.Numeric), - ["lindex", "rindex"], - ] - ) - rindex = pd.Index(expected["rindex"]) - lindex = pd.Index(expected["lindex"]) - - lactual, ractual = get_join_indices( - df, - right, - [("E", "Dates", "<="), ("B", "Numeric", ">")], - return_ragged_arrays=True, - ) - if isinstance(ractual, list): - ractual = [right.index[arr] for arr in ractual] - lengths = [len(arr) for arr in ractual] - ractual = np.concatenate(ractual) - lactual = pd.Index(lactual).repeat(lengths) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - sorter = np.lexsort((ractual, lactual)) - lactual = lactual[sorter] - ractual = ractual[sorter] - sorter = np.lexsort((rindex, lindex)) - lindex = lindex[sorter] - rindex = rindex[sorter] - assert_index_equal(rindex, ractual, check_names=False) - assert_index_equal(lindex, lactual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_le_ge_indices_ragged_arrays(df, right): - """compare join indices for range join.""" - - expected = ( - df.assign(lindex=range(len(df))) - .merge( - right.assign(rindex=range(len(right))), - how="cross", - ) - .loc[ - lambda df: df.E.le(df.Dates) & df.B.ge(df.Numeric), - ["lindex", "rindex"], - ] - ) - rindex = pd.Index(expected["rindex"]) - lindex = pd.Index(expected["lindex"]) - - lactual, ractual = get_join_indices( - df, - right, - [("E", "Dates", "<="), ("B", "Numeric", ">=")], - return_ragged_arrays=True, - ) - if isinstance(ractual, list): - ractual = [right.index[arr] for arr in ractual] - lengths = [len(arr) for arr in ractual] - ractual = np.concatenate(ractual) - lactual = pd.Index(lactual).repeat(lengths) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - sorter = np.lexsort((ractual, lactual)) - lactual = lactual[sorter] - ractual = ractual[sorter] - sorter = np.lexsort((rindex, lindex)) - lindex = lindex[sorter] - rindex = rindex[sorter] - assert_index_equal(rindex, ractual, check_names=False) - assert_index_equal(lindex, lactual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_ge_le_indices_ragged_arrays(df, right): - """compare join indices for range join.""" - - expected = ( - df.assign(lindex=range(len(df))) - .merge( - right.assign(rindex=range(len(right))), - how="cross", - ) - .loc[ - lambda df: df.E.ge(df.Dates) & df.B.le(df.Numeric), - ["lindex", "rindex"], - ] - ) - rindex = pd.Index(expected["rindex"]) - lindex = pd.Index(expected["lindex"]) - - lactual, ractual = get_join_indices( - df, - right, - [("E", "Dates", ">="), ("B", "Numeric", "<=")], - return_ragged_arrays=True, - ) - if isinstance(ractual, list): - ractual = [right.index[arr] for arr in ractual] - lengths = [len(arr) for arr in ractual] - ractual = np.concatenate(ractual) - lactual = pd.Index(lactual).repeat(lengths) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - sorter = np.lexsort((ractual, lactual)) - lactual = lactual[sorter] - ractual = ractual[sorter] - sorter = np.lexsort((rindex, lindex)) - lindex = lindex[sorter] - rindex = rindex[sorter] - assert_index_equal(rindex, ractual, check_names=False) - assert_index_equal(lindex, lactual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_range_indices_ragged_arrays(df, right): - """compare join indices for range join.""" - - expected = ( - df.assign(lindex=range(len(df))) - .merge( - right.assign(rindex=range(len(right))), - how="cross", - ) - .loc[ - lambda df: df.E.lt(df.Dates) & df.B.gt(df.Numeric), - ["lindex", "rindex"], - ] - ) - rindex = pd.Index(expected["rindex"]) - lindex = pd.Index(expected["lindex"]) - - lactual, ractual = get_join_indices( - df, - right, - [("E", "Dates", "<"), ("B", "Numeric", ">")], - return_ragged_arrays=True, - ) - if isinstance(ractual, list): - ractual = [right.index[arr] for arr in ractual] - lengths = [len(arr) for arr in ractual] - ractual = np.concatenate(ractual) - lactual = pd.Index(lactual).repeat(lengths) - ractual = pd.Index(ractual) - lactual = pd.Index(lactual) - sorter = np.lexsort((ractual, lactual)) - lactual = lactual[sorter] - ractual = ractual[sorter] - sorter = np.lexsort((rindex, lindex)) - lindex = lindex[sorter] - rindex = rindex[sorter] - assert_index_equal(rindex, ractual, check_names=False) - assert_index_equal(lindex, lactual, check_names=False) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_ge_eq_and_le_datess_indices(df, right): - """compare join indices for multiple conditions.""" - expected = ( - df.dropna(subset="E") - .reset_index() - .merge( - right.dropna(subset="Dates"), - left_on="E", - right_on="Dates", - how="inner", - sort=False, - ) - .loc[ - lambda df: df.B.gt(df.Floats) - & df.A.lt(df.Integers) - & df.B.ne(df.Numeric), - "index", - ] - ) - expected = pd.Index(expected) - - actual, _ = get_join_indices( - df[["B", "A", "E"]], - right[["Floats", "Integers", "Dates", "Numeric"]], - [ - ("A", "Integers", "<"), - ("E", "Dates", "=="), - ("B", "Floats", ">"), - ("B", "Numeric", "!="), - ], - ) - actual = df.index[actual] - assert_index_equal(expected, actual, check_names=False) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_non_equi(df, right): - """Test output for multiple conditions.""" - - columns = ["B", "A", "E", "Floats", "Integers", "Dates"] - expected = ( - df.merge( - right, - how="cross", - ) - .loc[ - lambda df: df.A.ge(df.Integers) - & df.E.le(df.Dates) - & df.B.lt(df.Floats), - columns, - ] - .sort_values(columns, ignore_index=True) - ) - - actual = ( - df[["B", "A", "E"]] - .conditional_join( - right[["Floats", "Integers", "Dates"]], - ("A", "Integers", ">="), - ("E", "Dates", "<="), - ("B", "Floats", "<"), - how="inner", - ) - .sort_values(columns, ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_non_equii_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ge(df.Integers) - & df.E.le(df.Dates) - & df.B.lt(df.Floats), - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">="), - ("E", "Dates", "<="), - ("B", "Floats", "<"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=False, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_non_equii_numba_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ge(df.Integers) - & df.E.le(df.Dates) - & df.B.lt(df.Floats), - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">="), - ("E", "Dates", "<="), - ("B", "Floats", "<"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle +@pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) def test_multiple_non_equi_numba_(df, right): @@ -5687,145 +3854,20 @@ def test_multiple_non_equi_numba_(df, right): & df.E.le(df.Dates) & df.B.lt(df.Floats), columns, - ] - .sort_values(columns, ignore_index=True) - ) - - actual = ( - df[["B", "A", "E"]] - .conditional_join( - right[["Floats", "Integers", "Dates"]], - ("A", "Integers", ">="), - ("E", "Dates", "<="), - ("B", "Floats", "<"), - how="inner", - ) - .sort_values(columns, ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -@pytest.mark.turtle -def test_multiple_non_equii(df, right): - """Test output for multiple conditions.""" - - columns = ["B", "A", "E", "Floats", "Integers", "Dates", "Numeric"] - expected = ( - df.merge( - right, - how="cross", - ) - .loc[ - lambda df: df.A.ge(df.Integers) - & df.E.le(df.Dates) - & df.B.lt(df.Floats) - & df.B.gt(df.Numeric), - columns, - ] - .sort_values(columns, ignore_index=True) - ) - expected = expected.filter(columns) - actual = ( - df[["B", "A", "E"]] - .conditional_join( - right[["Floats", "Integers", "Dates", "Numeric"]], - ("A", "Integers", ">="), - ("E", "Dates", "<="), - ("B", "Floats", "<"), - ("B", "Numeric", ">"), - how="inner", - ) - .sort_values(columns, ignore_index=True) - .loc[:, columns] - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_non_equiii_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ge(df.Integers) - & df.E.le(df.Dates) - & df.B.lt(df.Floats) - & df.B.gt(df.Numeric), - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", ">="), - ("E", "Dates", "<="), - ("B", "Floats", "<"), - ("B", "Numeric", ">"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=False, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_non_equiii_numba_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ge(df.Integers) - & df.E.le(df.Dates) - & df.B.lt(df.Floats) - & df.B.gt(df.Numeric), - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(_count=lambda df: df._count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "_count"]] + ] + .sort_values(columns, ignore_index=True) ) + actual = ( - df.conditional_join( - right, + df[["B", "A", "E"]] + .conditional_join( + right[["Floats", "Integers", "Dates"]], ("A", "Integers", ">="), ("E", "Dates", "<="), ("B", "Floats", "<"), - ("B", "Numeric", ">"), - how="left", - row_count="_count", - df_columns=["A", "B", "E"], - use_numba=True, + how="inner", ) - .astype({"_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) + .sort_values(columns, ignore_index=True) ) assert_frame_equal(expected, actual) @@ -5834,7 +3876,7 @@ def test_multiple_non_equiii_numba_row_count(df, right): @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @pytest.mark.turtle -def test_multiple_non_equii_numba_(df, right): +def test_multiple_non_equii(df, right): """Test output for multiple conditions.""" columns = ["B", "A", "E", "Floats", "Integers", "Dates", "Numeric"] @@ -5873,7 +3915,7 @@ def test_multiple_non_equii_numba_(df, right): @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @pytest.mark.turtle -def test_multiple_non_equii_col_syntax(df, right): +def test_multiple_non_equii_numba_(df, right): """Test output for multiple conditions.""" columns = ["B", "A", "E", "Floats", "Integers", "Dates", "Numeric"] @@ -5909,83 +3951,40 @@ def test_multiple_non_equii_col_syntax(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) -def test_multiple_non_equiii_non_range_row_count(df, right): +@pytest.mark.turtle +def test_multiple_non_equii_col_syntax(df, right): """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.ge(df.Integers) - & df.E.gt(df.Dates) - & df.B.gt(df.Floats) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) + + columns = ["B", "A", "E", "Floats", "Integers", "Dates", "Numeric"] expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( + df.merge( right, - ("A", "Integers", ">="), - ("E", "Dates", ">"), - ("B", "Floats", ">"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=False, + how="cross", ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_non_equiii_non_range_numba_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") .loc[ lambda df: df.A.ge(df.Integers) - & df.E.gt(df.Dates) - & df.B.gt(df.Floats) + & df.E.le(df.Dates) + & df.B.lt(df.Floats) + & df.B.gt(df.Numeric), + columns, ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] + .sort_values(columns, ignore_index=True) ) + expected = expected.filter(columns) actual = ( - df.conditional_join( - right, + df[["B", "A", "E"]] + .conditional_join( + right[["Floats", "Integers", "Dates", "Numeric"]], ("A", "Integers", ">="), - ("E", "Dates", ">"), - ("B", "Floats", ">"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=True, + ("E", "Dates", "<="), + ("B", "Floats", "<"), + ("B", "Numeric", ">"), + how="inner", ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) + .sort_values(columns, ignore_index=True) + .loc[:, columns] ) assert_frame_equal(expected, actual) @@ -6083,88 +4082,6 @@ def test_multiple_non_eqi_numba(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_non_equiii_non_range_le_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.le(df.Integers) - & df.E.lt(df.Dates) - & df.B.lt(df.Floats) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "<="), - ("E", "Dates", "<"), - ("B", "Floats", "<"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=False, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_non_equiii_non_range_le_numba_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.le(df.Integers) - & df.E.lt(df.Dates) - & df.B.lt(df.Floats) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("row_count") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(row_count=lambda df: df.row_count.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "row_count"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "<="), - ("E", "Dates", "<"), - ("B", "Floats", "<"), - how="left", - row_count="row_count", - df_columns=["A", "B", "E"], - use_numba=True, - ) - .astype({"row_count": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -6518,210 +4435,6 @@ def test_multiple_eqs(df, right): assert_frame_equal(expected, actual) -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_eq_ne_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.eq(df.Integers) - & df.E.ne(df.Dates) - & df.B.eq(df.Floats) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("counter") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(counter=lambda df: df.counter.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "counter"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "=="), - ("E", "Dates", "!="), - ("B", "Floats", "=="), - how="left", - row_count="counter", - df_columns=["A", "B", "E"], - use_numba=False, - ) - .astype({"counter": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_eq_range_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.eq(df.Integers) - & df.E.lt(df.Dates) - & df.B.gt(df.Floats) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("counter") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(counter=lambda df: df.counter.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "counter"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "=="), - ("E", "Dates", "<"), - ("B", "Floats", ">"), - how="left", - row_count="counter", - df_columns=["A", "B", "E"], - use_numba=False, - ) - .astype({"counter": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_eq_numba_range_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[ - lambda df: df.A.eq(df.Integers) - & df.E.lt(df.Dates) - & df.B.gt(df.Floats) - ] - .groupby(["A", "B", "E", "index"], dropna=False) - .size() - .rename("counter") - ) - expected = ( - df.merge(expected, how="left", on=["B", "E", "index"]) - .assign(counter=lambda df: df.counter.fillna(0).astype(int)) - .sort_values(["A", "B", "E", "index"], ignore_index=True) - .loc[:, ["A", "B", "E", "counter"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "=="), - ("E", "Dates", "<"), - ("B", "Floats", ">"), - how="left", - row_count="counter", - df_columns=["A", "B", "E"], - use_numba=True, - ) - .astype({"counter": int}) - .sort_values(["A", "B", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_eq_numba_lt_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.eq(df.Integers) & df.E.lt(df.Dates)] - .groupby(["A", "E", "index"], dropna=False) - .size() - .rename("counter") - ) - expected = ( - df.merge(expected, how="left", on=["A", "E", "index"]) - .assign(counter=lambda df: df.counter.fillna(0).astype(int)) - .sort_values(["A", "E", "index"], ignore_index=True) - .loc[:, ["A", "E", "counter"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "=="), - ("E", "Dates", "<"), - how="left", - row_count="counter", - df_columns=["A", "E"], - use_numba=True, - ) - .astype({"counter": int}) - .sort_values(["A", "E"], ignore_index=True) - ) - - assert_frame_equal(expected, actual) - - -@pytest.mark.turtle -@settings(deadline=None, max_examples=10) -@given(df=conditional_df(), right=conditional_right()) -def test_multiple_eq_numba_gt_row_count(df, right): - """Test output for multiple conditions.""" - df = df.assign(index=df.index) - expected = ( - df.merge(right, how="cross") - .loc[lambda df: df.A.eq(df.Integers) & df.B.gt(df.Floats)] - .groupby(["A", "B", "index"], dropna=False) - .size() - .rename("counter") - ) - expected = ( - df.merge(expected, how="left", on=["A", "B", "index"]) - .assign(counter=lambda df: df.counter.fillna(0).astype(int)) - .sort_values(["A", "B", "index"], ignore_index=True) - .loc[:, ["A", "B", "counter"]] - ) - actual = ( - df.conditional_join( - right, - ("A", "Integers", "=="), - ("B", "Floats", ">"), - how="left", - row_count="counter", - df_columns=[ - "A", - "B", - ], - use_numba=True, - ) - .astype({"counter": int}) - .sort_values( - [ - "A", - "B", - ], - ignore_index=True, - ) - ) - - assert_frame_equal(expected, actual) - - @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right())