Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/biocutils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,5 @@

from .biocobject import BiocObject
from .table import table

from .duplicated import duplicated, unique
201 changes: 201 additions & 0 deletions src/biocutils/duplicated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
from typing import Any, Union, Sequence
from functools import singledispatch

import numpy

from .Factor import Factor
from .subset import subset


@singledispatch
def duplicated(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> numpy.ndarray:
    """
    Find duplicated elements of ``x``.

    Args:
        x:
            Object to be searched for duplicates.
            This is usually a sequence that can be iterated over.
            It must support ``len()`` and, if ``from_last = True``, integer indexing.
            Comparable elements are tracked in a set, so they must be hashable.

        incomparables:
            Values of ``x`` that cannot be compared.
            Any value of ``x`` in ``incomparables`` will never be a duplicate.
            Any object that supports the ``in`` operator
            (e.g., by defining ``__contains__``) can be used here.
            The default empty set is only ever queried, never mutated.

        from_last:
            Whether to report the last occurrence as a non-duplicate.

    Returns:
        Boolean NumPy array of length equal to that of ``x``.
        An entry is truthy if the corresponding element of ``x`` duplicates an earlier element,
        i.e., the first occurrence of each value is falsey and all later occurrences are truthy.
        If ``from_last = True``, the last occurrence of each value is falsey
        and all earlier occurrences are truthy instead.

    Examples:
        >>> import biocutils
        >>> biocutils.duplicated([1, 2, 1, 2, 3, 2])
        array([False, False,  True,  True, False,  True])
        >>> biocutils.duplicated([1, 2, 1, 2, 3, 2], from_last=True)
        array([ True,  True, False,  True, False, False])
        >>> biocutils.duplicated([1, 2, None, None, 3, 2])
        array([False, False, False,  True, False,  True])
        >>> biocutils.duplicated([1, 2, None, None, 3, 2], incomparables=set([None]))
        array([False, False, False, False, False,  True])
    """

    seen = set()
    # Start from an all-False array so that only confirmed duplicates need to be written.
    output = numpy.zeros(len(x), dtype=numpy.bool_)

    def process(i, y):
        # Incomparable values are never duplicates and are never remembered.
        if y in incomparables:
            return
        if y in seen:
            output[i] = True
        else:
            seen.add(y)

    if not from_last:
        for i, y in enumerate(x):
            process(i, y)
    else:
        # Walk backwards so that the last occurrence is the one treated as "first seen".
        for i in range(len(x) - 1, -1, -1):
            process(i, x[i])

    return output


@duplicated.register
def _duplicated_Factor(
    x: Factor, incomparables: Union[set, Sequence] = set(), from_last: bool = False
) -> numpy.ndarray:
    """
    Find duplicated entries of a ``Factor`` by scanning its integer codes.

    Args:
        x:
            Factor to be searched for duplicates.

        incomparables:
            Levels (or ``None``, for missing entries) that cannot be compared.
            Entries of ``x`` with such a level are never reported as duplicates.
            Only queried with the ``in`` operator, never mutated.

        from_last:
            Whether to report the last occurrence as a non-duplicate.

    Returns:
        Boolean NumPy array of length equal to that of ``x``,
        truthy for every entry that repeats a level already encountered
        in the chosen scan direction.
    """
    # One tri-state slot per level:
    # None = incomparable, False = not seen yet, True = already seen.
    state = [None if lev in incomparables else False for lev in x.get_levels()]

    # Handling codes of -1, i.e., None: the extra trailing slot is what a
    # list index of -1 resolves to.
    state.append(None if None in incomparables else False)

    codes = x.get_codes()
    # Start from an all-False array so that only confirmed duplicates need to be written.
    output = numpy.zeros(len(x), dtype=numpy.bool_)

    positions = range(len(x) - 1, -1, -1) if from_last else range(len(x))
    for i in positions:
        code = codes[i]
        current = state[code]
        if current is None:
            # Incomparable: never a duplicate, never remembered.
            continue
        if current:
            output[i] = True
        else:
            state[code] = True

    return output


def unique(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> Any:
    """
    Get all unique values of ``x``.

    This drops every entry of ``x`` that :py:func:`duplicated` flags as a duplicate
    and keeps the rest in their original order.

    Args:
        x:
            Object in which to find unique entries.
            This is usually a sequence that can be iterated over.

        incomparables:
            Values of ``x`` that cannot be compared.
            Any value of ``x`` in ``incomparables`` will never be a duplicate,
            so every occurrence of it is retained.
            Any object that supports the ``in`` operator
            (e.g., by defining ``__contains__``) can be used here.

        from_last:
            Whether to retain the last occurrence of each value in ``x``.
            By default, the first occurrence is retained.

    Returns:
        An object containing unique values of ``x``.
        This is usually of the same class as ``x``.

    Examples:
        >>> import biocutils
        >>> biocutils.unique([1, 2, 1, 2, 3, 2])
        [1, 2, 3]
        >>> biocutils.unique([1, 2, None, None, 3, 2])
        [1, 2, None, 3]
        >>> biocutils.unique([1, 2, None, None, 3, 2], incomparables=set([None]))
        [1, 2, None, None, 3]
    """
    is_duplicate = duplicated(x, incomparables=incomparables, from_last=from_last)
    # Positions of the entries that were not flagged as duplicates.
    keep = numpy.where(numpy.logical_not(is_duplicate))[0]
    return subset(x, keep)
23 changes: 23 additions & 0 deletions tests/test_duplicated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import biocutils


def test_duplicated_basic():
    """Check ``duplicated`` on plain lists: default scan, ``from_last`` and ``incomparables``."""
    simple = [1, 2, 1, 2, 3, 2]

    forward = biocutils.duplicated(simple)
    assert list(forward) == [False, False, True, True, False, True]

    backward = biocutils.duplicated(simple, from_last=True)
    assert list(backward) == [True, True, False, True, False, False]

    with_missing = [1, 2, None, None, 3, 2, 3]

    # By default, None is compared like any other value.
    comparable = biocutils.duplicated(with_missing)
    assert list(comparable) == [False, False, False, True, False, True, True]

    # Once marked incomparable, repeated None entries are no longer duplicates.
    ignored = biocutils.duplicated(with_missing, incomparables=set([None]))
    assert list(ignored) == [False, False, False, False, False, True, True]


def test_duplicated_Factor():
    """Check the ``Factor`` dispatch of ``duplicated`` against the same cases as plain lists."""
    simple = biocutils.Factor.from_sequence([1, 2, 1, 2, 3, 2])

    forward = biocutils.duplicated(simple)
    assert list(forward) == [False, False, True, True, False, True]

    backward = biocutils.duplicated(simple, from_last=True)
    assert list(backward) == [True, True, False, True, False, False]

    with_missing = biocutils.Factor.from_sequence([1, 2, None, None, 3, 2, 3])

    comparable = biocutils.duplicated(with_missing)
    assert list(comparable) == [False, False, False, True, False, True, True]

    # Missing entries marked as incomparable are never duplicates.
    no_missing = biocutils.duplicated(with_missing, incomparables=set([None]))
    assert list(no_missing) == [False, False, False, False, False, True, True]

    # A level marked as incomparable is never a duplicate either.
    no_level = biocutils.duplicated(with_missing, incomparables=set(["2"]))
    assert list(no_level) == [False, False, False, True, False, False, True]


def test_unique():
    """Check ``unique`` on plain lists: default scan, ``from_last`` and ``incomparables``."""
    simple = [1, 2, 1, 2, 3, 2]

    first_kept = biocutils.unique(simple)
    assert first_kept == [1, 2, 3]

    last_kept = biocutils.unique(simple, from_last=True)
    assert last_kept == [1, 3, 2]

    with_missing = [1, 2, None, None, 3, 2]

    collapsed = biocutils.unique(with_missing)
    assert collapsed == [1, 2, None, 3]

    # Incomparable values are all retained.
    retained = biocutils.unique(with_missing, incomparables=set([None]))
    assert retained == [1, 2, None, None, 3]