Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/biocutils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from .is_list_of_type import is_list_of_type
from .is_missing_scalar import is_missing_scalar
from .map_to_index import map_to_index
from .match import match
from .match import match, create_match_index, MatchIndex
from .normalize_subscript import normalize_subscript, SubscriptTypes
from .print_truncated import print_truncated, print_truncated_dict, print_truncated_list
from .print_wrapped_table import create_floating_names, print_type, print_wrapped_table, truncate_strings
Expand Down
341 changes: 298 additions & 43 deletions src/biocutils/match.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,323 @@
from typing import Optional, Sequence, Union
from typing import Any, Optional, Literal, Union, Sequence
from functools import singledispatch

import numpy

from .map_to_index import DUPLICATE_METHOD, map_to_index

class MatchIndex:
"""
An index for matching one or more ``x`` against different ``targets``.
This is typically constructed by :py:func:`~create_match_index`.
"""

def __init__(
self,
targets: Any,
duplicate_method: Literal["first", "last", "any"] = "first",
incomparables: Union[set, Sequence] = set(),
dtype: Optional[numpy.dtype] = None,
fail_missing: Optional[bool] = None,
):
"""
Args:
targets:
Targets to be matched against, see :py:func:`~match` for details.

duplicate_method:
How to handle duplicate entries in ``targets``, see :py:func:`~match` for details.

incomparables:
Values that cannot be compared, see :py:func:`~match` for details.

dtype:
NumPy type of the output array, see :py:func:`~match` for details.

fail_missing:
Whether to raise an error if a value cannot be found in ``targets``, see :py:func:`~match` for details.
"""

from .Factor import Factor

if isinstance(targets, dict):
# Back-compatible behavior.
import warnings

warnings.warn(DeprecationWarning("'map_to_index()' is deprecated, use 'create_match_index()' instead"))
self._map = targets

elif isinstance(targets, Factor):
# Optimized method when both x and targets are factors.
target_index = [None] * (len(targets.get_levels()) + 1) # add 1 so that code = -1 still behaves correctly.
first_tie = duplicate_method == "first" or duplicate_method == "any"
for i, code in enumerate(targets.get_codes()):
if not first_tie or target_index[code] is None:
target_index[code] = i

mapping = {}
for i, lev in enumerate(targets.get_levels()):
if lev not in incomparables:
candidate = target_index[i]
if candidate is not None:
mapping[lev] = candidate

if None not in incomparables:
# None matching to another None is still possible.
candidate = target_index[-1]
if candidate is not None:
mapping[None] = target_index[-1]

self._map = mapping

else:
first_tie = duplicate_method == "first" or duplicate_method == "any"
mapping = {}
for i, val in enumerate(targets):
if val not in incomparables:
if not first_tie or val not in mapping:
mapping[val] = i
self._map = mapping

if dtype is None:
dtype = numpy.min_scalar_type(-len(targets)) # get a signed type
self._dtype = dtype

if fail_missing is None:
fail_missing = numpy.issubdtype(dtype, numpy.unsignedinteger)
self._fail_missing = fail_missing

def match(self, x: Any) -> numpy.ndarray:
"""
Args:
x:
Values to match against ``targets``.

Returns:
NumPy array of length equal to ``x``, containing the integer position of each entry of ``x`` inside ``targets``;
see :py:func:`~match` for more details.
"""

from .Factor import Factor

indices = numpy.zeros(len(x), dtype=self._dtype)

if not isinstance(x, Factor):
# Separate loops to reduce branching in the tight inner loop.
if not self._fail_missing:
for i, y in enumerate(x):
if y in self._map:
indices[i] = self._map[y]
else:
indices[i] = -1
else:
for i, y in enumerate(x):
if y not in self._map:
raise ValueError("cannot find '" + str(y) + "' in 'targets'")
indices[i] = self._map[y]

else:
x_index = [-1] * (len(x.get_levels()) + 1) # adding 1 so that code = -1 still works.
for i, lev in enumerate(x.get_levels()):
if lev in self._map:
x_index[i] = self._map[lev]

if None in self._map:
x_index[-1] = self._map[None]

# Separate loops to reduce branching in the tight inner loop.
if self._fail_missing:
for i, code in enumerate(x.get_codes()):
candidate = x_index[code]
if candidate < 0:
raise ValueError("cannot find '" + str(x[i]) + "' in 'targets'")
indices[i] = candidate
else:
for i, code in enumerate(x.get_codes()):
indices[i] = x_index[code]

return indices


@singledispatch
def create_match_index(
targets: Any,
duplicate_method: Literal["first", "last", "any"] = "first",
incomparables: Union[set, Sequence] = set(),
dtype: Optional[numpy.dtype] = None,
fail_missing: Optional[bool] = None,
) -> MatchIndex:
"""
Create a index for matching an arbitrary sequence against ``targets``.
Calling ``create_match_index(targets, ...).match(x)`` is equivalent to ``match(x, targets, ...)``.

Args:
targets:
Targets to be matched against, see :py:func:`~match` for details.

duplicate_method:
How to handle duplicate entries in ``targets``, see :py:func:`~match` for details.

incomparables:
Values that cannot be compared, see :py:func:`~match` for details.

dtype:
NumPy type of the output array, see :py:func:`~match` for details.

fail_missing:
Whether to raise an error if a value cannot be found in ``targets``, see :py:func:`~match` for details.

Returns:
A ``MatchIndex``.
Other implementations of ``create_match_index()`` may return any object that has a ``match()`` method.

Examples:
>>> import biocutils
>>> mobj = biocutils.create_match_index(
... [
... "A",
... "B",
... "C",
... "D",
... ]
... )
>>> mobj.match(
... [
... "A",
... "B",
... "B",
... "C",
... "C",
... "D",
... "E",
... ]
... )
>>>
>>> ft = biocutils.Factor.from_sequence(
... [
... "a",
... "B",
... "c",
... "D",
... "e",
... "B",
... "D",
... ]
... )
>>> fobj = biocutils.create_match_index(
... ft
... )
>>> fx = biocutils.Factor.from_sequence(
... [
... "A",
... "B",
... "B",
... "C",
... "C",
... "D",
... "E",
... ]
... )
>>> fobj.match(fx)
"""

return MatchIndex(
targets, duplicate_method=duplicate_method, incomparables=incomparables, dtype=dtype, fail_missing=fail_missing
)


@singledispatch
def match(
x: Sequence,
targets: Union[dict, Sequence],
duplicate_method: DUPLICATE_METHOD = "first",
x: Any,
targets: Any,
duplicate_method: Literal["first", "last", "any"] = "first",
incomparables: Union[set, Sequence] = set(),
dtype: Optional[numpy.dtype] = None,
fail_missing: Optional[bool] = None,
) -> numpy.ndarray:
"""Find a matching value of each element of ``x`` in ``target``.
"""
Find a matching value of each element of ``x`` in ``targets``.
Calling ``match(x, targets, ...)`` should be equivalent to ``create_match_index(targets, ...).match(x)``.

Args:
x:
Sequence of values to match.
Values to match against ``targets``.

targets:
Sequence of targets to be matched against. Alternatively, a
dictionary generated by passing a sequence of targets to
:py:meth:`~biocutils.map_to_index.map_to_index`.
Targets to be matched against.
It is not strictly necessary that ``x`` is of the same type as ``targets``,
but entries of ``x`` should be capable of being equal to entries of ``x``.

duplicate_method:
How to handle duplicate entries in ``targets``. Matches can
be reported to the first or last occurrence of duplicates.
How to handle duplicate entries in ``targets``.
Either the first, last or any occurrence of each target is reported.

incomparables:
Values of ``x`` or ``targets`` that cannot be compared.
No match will be reported for any value of ``x`` that is in ``incomparables``.
Any object that has an ``__in__`` method can be used here.

dtype:
NumPy type of the output array. This should be an integer type; if
missing values are expected, the type should be a signed integer.
If None, a suitable signed type is automatically determined.
NumPy type of the output array.
This should be an integer type; if missing values are expected, the type should be a signed integer.
If ``None``, a suitable signed type is automatically determined.

fail_missing:
Whether to raise an error if ``x`` cannot be found in ``targets``.
If ``None``, this defaults to ``True`` if ``dtype`` is an unsigned
type, otherwise it defaults to ``False``.
If ``None``, this defaults to ``True`` if ``dtype`` is an unsigned type, otherwise it defaults to ``False``.

Returns:
Array of length equal to ``x``, containing the integer position of each
entry of ``x`` inside ``target``; or -1, if the entry of ``x`` is
None or cannot be found in ``target``.
"""
if not isinstance(targets, dict):
targets = map_to_index(targets, duplicate_method=duplicate_method)
NumPy array of length equal to ``x``, containing the integer position of each entry of ``x`` inside ``targets``;
or -1, if the entry of ``x`` is ``None`` or cannot be found in ``targets``.

if dtype is None:
dtype = numpy.min_scalar_type(-len(targets)) # get a signed type
indices = numpy.zeros(len(x), dtype=dtype)

if fail_missing is None:
fail_missing = numpy.issubdtype(dtype, numpy.unsignedinteger)
Examples:
>>> import biocutils
>>> biocutils.match(
... [
... "A",
... "B",
... "B",
... "C",
... "D",
... "D",
... "E",
... ],
... [
... "A",
... "B",
... "C",
... "D",
... ],
... )
>>>
>>> fx = biocutils.Factor.from_sequence(
... [
... "A",
... "B",
... "B",
... "C",
... "C",
... "D",
... "E",
... ]
... )
>>> ft = biocutils.Factor.from_sequence(
... [
... "a",
... "B",
... "c",
... "D",
... "e",
... "B",
... "D",
... ]
... )
>>> biocutils.match(
... fx,
... ft,
... duplicate_method="last",
... )
"""

# Separate loops to reduce branching in the tight inner loop.
if not fail_missing:
for i, y in enumerate(x):
if y in targets:
indices[i] = targets[y]
else:
indices[i] = -1
else:
for i, y in enumerate(x):
if y not in targets:
raise ValueError("cannot find '" + str(y) + "' in 'targets'")
indices[i] = targets[y]

return indices
obj = create_match_index(
targets, duplicate_method=duplicate_method, incomparables=incomparables, dtype=dtype, fail_missing=fail_missing
)
return obj.match(x)
Loading