diff --git a/docs/source/aspects.rst b/docs/source/aspects.rst index 7a42dc2..1f3143d 100644 --- a/docs/source/aspects.rst +++ b/docs/source/aspects.rst @@ -100,6 +100,14 @@ WhiteSpaces ----------------------- .. autoclass:: wildnlp.aspects.white_spaces.WhiteSpaces + :members: + :special-members: __init__ + :show-inheritance: + +Lowercase +----------------------- + +.. autoclass:: wildnlp.aspects.lowercase.LowerCase :members: :special-members: __init__ :show-inheritance: \ No newline at end of file diff --git a/tests/aspects/test_lowercase.py b/tests/aspects/test_lowercase.py new file mode 100644 index 0000000..d16e1b2 --- /dev/null +++ b/tests/aspects/test_lowercase.py @@ -0,0 +1,12 @@ +from wildnlp.aspects import LowerCase + + +def test_single_word(): + assert LowerCase()("Language") == "language" + + +def test_sentence(): + sentence = "EU rejects German call to boycott British lamb." + transformed = LowerCase()(sentence) + + assert transformed == "eu rejects german call to boycott british lamb." diff --git a/wildnlp/aspects/__init__.py b/wildnlp/aspects/__init__.py index 8ed6fd8..36c8e3d 100644 --- a/wildnlp/aspects/__init__.py +++ b/wildnlp/aspects/__init__.py @@ -9,3 +9,4 @@ from .change_char import ChangeChar from .white_spaces import WhiteSpaces from .add_sub_string import AddSubString +from .lowercase import LowerCase diff --git a/wildnlp/aspects/lowercase.py b/wildnlp/aspects/lowercase.py new file mode 100644 index 0000000..f297430 --- /dev/null +++ b/wildnlp/aspects/lowercase.py @@ -0,0 +1,18 @@ +from .base import Aspect + + +class LowerCase(Aspect): + """Lower-cases the dataset. + + """ + + def __call__(self, sentence): + return " ".join([self._lowercase_word(word) + if word != '' else '' + for word in sentence.split(' ')]) + + @staticmethod + def _lowercase_word(word): + if len(word) == 0: + raise ValueError("Can't lowercase empty words") + return word.lower() diff --git a/wildnlp/datasets/conll.py b/wildnlp/datasets/conll.py index c551bba..1e5410f 100644 --- a/wildnlp/datasets/conll.py +++ b/wildnlp/datasets/conll.py @@ -54,7 +54,7 @@ def load(self, path): processed = self._process_sample(sample) self._data.append(processed) - def apply(self, aspect, apply_to_ne=False): + def apply(self, aspect, apply_to_ne=False, apply_to_both=False): """ :param aspect: transformation function @@ -62,6 +62,9 @@ def apply(self, aspect, apply_to_ne=False): :param apply_to_ne: if `False`, transformation won't be applied to Named Entities. If `True`, transformation will be applied only to Named Entities. + :param apply_to_both: if `True`, transformation will be applied + to both the Named Entities and other tokens. + :return: modified dataset in the following form: @@ -82,10 +85,13 @@ def apply(self, aspect, apply_to_ne=False): for entry in self._data: tags = entry['ner_tags'] - if apply_to_ne is False: - non_ner = np.where(tags == 'O')[0] + if not apply_to_both: + if apply_to_ne is False: + non_ner = np.where(tags == 'O')[0] + else: + non_ner = np.where(tags != 'O')[0] else: - non_ner = np.where(tags != 'O')[0] + non_ner = range(len(entry['tokens'])) if len(non_ner) == 0: modified.append(entry)