Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,4 @@ tags
*.profraw
/scratch.py
midpoint.csv
examples/notebooks/cond_join.ipynb
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Changelog

## [Unreleased]
- [ENH] Added `row_count` parameter for janitor.conditional_join - Issue #1269 @samukweku
- [ENH] `return_ragged_arrays` deprecated; get_join_indices function now returns a dictionary - Issue #520 @samukweku
- [ENH] Reverse deprecation of `pivot_wider()` -- Issue #1464
- [ENH] Add accessor and method for pandas DataFrameGroupBy objects. - Issue #587 @samukweku
- [ENH] Call mutate/summarise directly on groupby objects instead. Also add `ungroup` method to expose underlying dataframe of a grouped object. - Issue #1511 @samukweku
Expand Down
141 changes: 141 additions & 0 deletions janitor/functions/_conditional_join/_greater_than_indices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# helper functions for >/>=
import numpy as np
import pandas as pd

from janitor.functions._conditional_join._helpers import (
_null_checks_cond_join,
_sort_if_not_monotonic,
)


def _ge_gt_indices(
left: pd.array,
left_index: np.ndarray,
right: pd.array,
strict: bool,
) -> tuple | None:
"""
Use binary search to get indices where left
is greater than or equal to right.

If strict is True, then only indices
where `left` is greater than
(but not equal to) `right` are returned.
"""
search_indices = right.searchsorted(left, side="right")
# if any of the positions in `search_indices`
# is equal to 0 (less than 1), it implies that
# left[position] is not greater than any value
# in right
booleans = search_indices > 0
if not booleans.any():
return None
if not booleans.all():
left = left[booleans]
left_index = left_index[booleans]
search_indices = search_indices[booleans]
# the idea here is that if there are any equal values
# shift downwards to the immediate next position
# that is not equal
if strict:
booleans = left == right[search_indices - 1]
# replace positions where rows are equal with
# searchsorted('left');
# this works fine since we will be using the value
# as the right side of a slice, which is not included
# in the final computed value
if booleans.any():
replacements = right.searchsorted(left, side="left")
# now we can safely replace values
# with strictly greater than positions
search_indices = np.where(booleans, replacements, search_indices)
# any value less than 1 should be discarded
# since the lowest value for binary search
# with side='right' should be 1
booleans = search_indices > 0
if not booleans.any():
return None
if not booleans.all():
left_index = left_index[booleans]
search_indices = search_indices[booleans]
return left_index, search_indices


def _greater_than_indices(
    left: pd.Series,
    right: pd.Series,
    strict: bool,
    keep: str,
    return_matching_indices: bool,
) -> dict | None:
    """
    Use binary search to get indices where left
    is greater than or equal to right.

    If strict is True, then only indices
    where `left` is greater than
    (but not equal to) `right` are returned.

    Nulls are dropped from both sides and `right` is sorted (stable)
    when it is not already monotonic increasing.

    The result is always a dictionary (never `None`, despite the
    annotation) with `left_index` and `right_index` entries:

    - no match at all: two empty integer arrays;
    - `keep == "first"` / `keep == "last"`: one right label per matching
      left row (the smallest / largest label among its matches);
    - otherwise, if `return_matching_indices` is True: the full
      `right_index` plus `starts`/`ends`, where row `i` of `left_index`
      matches `right_index[starts[i]:ends[i]]`;
    - otherwise: the fully expanded pairs of matching labels.
    """
    no_match = {
        "left_index": np.array([], dtype=np.intp),
        "right_index": np.array([], dtype=np.intp),
    }
    # quick break, avoiding the hassle
    if left.max() < right.min():
        return no_match
    outcome = _null_checks_cond_join(series=left)
    if outcome is None:
        return no_match
    left, _ = outcome
    outcome = _null_checks_cond_join(series=right)
    if outcome is None:
        return no_match
    right, _ = outcome
    right, right_is_sorted = _sort_if_not_monotonic(series=right)
    outcome = _ge_gt_indices(
        left=left.array,
        right=right.array,
        left_index=left.index._values,
        strict=strict,
    )
    if outcome is None:
        return no_match
    # search_indices[i] is the exclusive end of the matching window:
    # left row i matches right_index[:search_indices[i]]
    left_index, search_indices = outcome
    right_index = right.index._values
    if keep == "first":
        if right_is_sorted:
            # `right` was not reordered and every window starts at
            # position 0, so the first label is taken for every row
            matched = right_index[np.zeros_like(search_indices)]
        else:
            matched = [right_index[:end].min() for end in search_indices]
        return {"left_index": left_index, "right_index": matched}
    if keep == "last":
        if right_is_sorted:
            # `right` was not reordered, so the last entry of each window
            # is taken; always map the position through `right_index` so
            # that labels (not raw positions) are returned, whether or not
            # nulls were dropped and whatever index `right` carries
            matched = right_index[search_indices - 1]
        else:
            matched = [right_index[:end].max() for end in search_indices]
        return {"left_index": left_index, "right_index": matched}
    if return_matching_indices:
        return dict(
            left_index=left_index,
            right_index=right_index,
            starts=np.repeat(0, search_indices.size),
            ends=search_indices,
        )
    # expand every window into explicit (left, right) pairs
    right = np.concatenate([right_index[:end] for end in search_indices])
    left = left_index.repeat(search_indices)
    return {"left_index": left, "right_index": right}
82 changes: 82 additions & 0 deletions janitor/functions/_conditional_join/_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# helper functions for conditional_join.py

from enum import Enum
from typing import Sequence

import numpy as np
import pandas as pd


class _JoinOperator(Enum):
    """
    Comparison operators recognised by conditional_join.

    Each member's value is the operator's string spelling.
    """

    # ordering comparisons
    GREATER_THAN = ">"
    LESS_THAN = "<"
    GREATER_THAN_OR_EQUAL = ">="
    LESS_THAN_OR_EQUAL = "<="
    # equality comparisons
    STRICTLY_EQUAL = "=="
    NOT_EQUAL = "!="


# string spellings of the operators, grouped by comparison direction
less_than_join_types = {
    op.value
    for op in (_JoinOperator.LESS_THAN, _JoinOperator.LESS_THAN_OR_EQUAL)
}
greater_than_join_types = {
    op.value
    for op in (
        _JoinOperator.GREATER_THAN,
        _JoinOperator.GREATER_THAN_OR_EQUAL,
    )
}


def _maybe_remove_nulls_from_dataframe(
    df: pd.DataFrame, columns: Sequence, return_bools: bool = False
):
    """
    Drop the rows of `df` that hold a null in any of `columns`.

    Returns `None` when every row has a null in at least one of the
    columns. If `return_bools` is True, a boolean Series marking the
    null-free rows is returned instead of a dataframe. Otherwise the
    filtered dataframe is returned; `df` itself is returned untouched
    when there is nothing to drop.
    """
    row_has_null = df.loc[:, [*columns]].isna().any(axis=1)
    if row_has_null.all():
        return None
    if return_bools:
        return ~row_has_null
    return df.loc[~row_has_null] if row_has_null.any() else df


def _null_checks_cond_join(series: pd.Series) -> tuple | None:
"""
Checks for nulls in the pandas series before conducting binary search.
"""
any_nulls = series.isna()
if any_nulls.all():
return None
if any_nulls.any():
series = series[~any_nulls]
return series, any_nulls.any()


def _sort_if_not_monotonic(series: pd.Series) -> tuple[pd.Series, bool]:
    """
    Sort the pandas `series` if it is not monotonic increasing.

    Returns a tuple of the (possibly sorted) series and a flag telling
    whether the input was already monotonic increasing. An already
    monotonic series is handed back as is; otherwise a stable sort is
    used, so equal values keep their original relative order.
    """
    is_sorted = series.is_monotonic_increasing
    if not is_sorted:
        series = series.sort_values(kind="stable")
    return series, is_sorted


def _keep_output(keep: str, left: np.ndarray, right: np.ndarray):
    """
    Reduce matched index pairs according to `keep`.

    With `keep == "all"` both arrays are returned unchanged. Otherwise
    `right` is grouped by `left` (groups kept in order of first
    appearance) and reduced to its minimum when `keep == "first"`, or to
    its maximum for any other value of `keep`. The group keys and the
    reduced values are returned.
    """
    if keep == "all":
        return left, right
    per_left_row = pd.Series(right).groupby(left, sort=False)
    if keep == "first":
        reduced = per_left_row.min()
    else:
        reduced = per_left_row.max()
    return reduced.index, reduced._values
150 changes: 150 additions & 0 deletions janitor/functions/_conditional_join/_less_than_indices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# helper functions for </<=
import numpy as np
import pandas as pd

from janitor.functions._conditional_join._helpers import (
_null_checks_cond_join,
_sort_if_not_monotonic,
)


def _le_lt_indices(
left: pd.array,
left_index: np.ndarray,
right: pd.array,
strict: bool,
) -> tuple | None:
"""
Use binary search to get indices where left
is less than or equal to right.

If strict is True, then only indices
where `left` is less than
(but not equal to) `right` are returned.

Returns the left index and the binary search positions for left in right.
"""
search_indices = right.searchsorted(left, side="left")
# if any of the positions in `search_indices`
# is equal to the length of `right_keys`
# that means the respective position in `left`
# has no values from `right` that are less than
# or equal, and should therefore be discarded
len_right = right.size
booleans = search_indices < len_right
if not booleans.any():
return None
if not booleans.all():
left = left[booleans]
left_index = left_index[booleans]
search_indices = search_indices[booleans]
# the idea here is that if there are any equal values
# shift to the right to the immediate next position
# that is not equal
if strict:
booleans = left == right[search_indices]
# replace positions where rows are equal
# with positions from searchsorted('right')
# positions from searchsorted('right') will never
# be equal and will be the furthermost in terms of position
# example : right -> [2, 2, 2, 3], and we need
# positions where values are not equal for 2;
# the furthermost will be 3, and searchsorted('right')
# will return position 3.
if booleans.any():
replacements = right.searchsorted(left, side="right")
# now we can safely replace values
# with strictly less than positions
search_indices = np.where(booleans, replacements, search_indices)
# check again if any of the values
# have become equal to length of right
# and get rid of them
booleans = search_indices < len_right
if not booleans.any():
return None
if not booleans.all():
left_index = left_index[booleans]
search_indices = search_indices[booleans]
return left_index, search_indices


def _less_than_indices(
    left: pd.Series,
    right: pd.Series,
    strict: bool,
    keep: str,
    return_matching_indices: bool,
) -> dict | None:
    """
    Use binary search to get indices where left
    is less than or equal to right.

    If strict is True, then only indices
    where `left` is less than
    (but not equal to) `right` are returned.

    Nulls are dropped from both sides and `right` is sorted (stable)
    when it is not already monotonic increasing.

    The result is always a dictionary (never `None`, despite the
    annotation) with `left_index` and `right_index` entries:

    - no match at all: two empty integer arrays;
    - `keep == "first"` / `keep == "last"`: one right label per matching
      left row (the smallest / largest label among its matches);
    - otherwise, if `return_matching_indices` is True: the full
      `right_index` plus `starts`/`ends`, where row `i` of `left_index`
      matches `right_index[starts[i]:ends[i]]`;
    - otherwise: the fully expanded pairs of matching labels.
    """
    no_match = {
        "left_index": np.array([], dtype=np.intp),
        "right_index": np.array([], dtype=np.intp),
    }
    # no point going through all the hassle
    if left.min() > right.max():
        return no_match
    outcome = _null_checks_cond_join(series=left)
    if outcome is None:
        return no_match
    left, _ = outcome
    outcome = _null_checks_cond_join(series=right)
    if outcome is None:
        return no_match
    right, _ = outcome
    right, right_is_sorted = _sort_if_not_monotonic(series=right)
    outcome = _le_lt_indices(
        left=left.array,
        right=right.array,
        left_index=left.index._values,
        strict=strict,
    )
    if outcome is None:
        return no_match
    # search_indices[i] is the inclusive start of the matching window:
    # left row i matches right_index[search_indices[i]:]
    left_index, search_indices = outcome
    len_right = right.size
    right_index = right.index._values
    if keep == "first":
        if right_is_sorted:
            # `right` was not reordered, so the first entry of each window
            # is taken; always map the position through `right_index` so
            # that labels (not raw positions) are returned, whether or not
            # nulls were dropped and whatever index `right` carries
            matched = right_index[search_indices]
        else:
            matched = [right_index[start:].min() for start in search_indices]
        return {"left_index": left_index, "right_index": matched}
    if keep == "last":
        if right_is_sorted:
            # `right` was not reordered and every window runs to the end,
            # so the last label is taken for every row
            matched = right_index[np.full_like(search_indices, len_right - 1)]
        else:
            matched = [right_index[start:].max() for start in search_indices]
        return {"left_index": left_index, "right_index": matched}
    if return_matching_indices:
        return dict(
            left_index=left_index,
            right_index=right_index,
            starts=search_indices,
            ends=np.repeat(len_right, search_indices.size),
        )
    # expand every window into explicit (left, right) pairs
    right = np.concatenate([right_index[start:] for start in search_indices])
    left = left_index.repeat(len_right - search_indices)
    return {"left_index": left, "right_index": right}
Loading
Loading