From 84c6637a022ae6867b0227f16b048693bb887536 Mon Sep 17 00:00:00 2001 From: Mayank Lunayach Date: Wed, 17 Jul 2019 18:58:59 +0800 Subject: [PATCH 1/6] Adding lowercase aspect --- wildnlp/aspects/lowercase.py | 45 ++++++++++++++++++++++++++++++++++++ wildnlp/datasets/conll.py | 14 +++++++---- 2 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 wildnlp/aspects/lowercase.py diff --git a/wildnlp/aspects/lowercase.py b/wildnlp/aspects/lowercase.py new file mode 100644 index 0000000..48755d0 --- /dev/null +++ b/wildnlp/aspects/lowercase.py @@ -0,0 +1,45 @@ +import random + +from .base import Aspect + + +class LowerCase(Aspect): + """Randomly lowercases the dataset + + .. caution:: Uses random numbers, default seed is 42. + """ + + def __init__(self, words_percentage=50, seed=42): + """ + + :param words_percentage: Percentage of the words in the dataset which needs to be lowercased. + Defaults to 0.5. + + :param seed: Random seed. + """ + + if words_percentage >= 1: + words_percentage /= 100. + + random.seed(seed) + + def __call__(self, sentence): + tokens = self._tokenize(sentence) + lowercased_tokens = self._lowercase_tokens(tokens) + return self._detokenize(lowercased_tokens) + + @staticmethod + def _lowercase_tokens(tokens): + + modified = [] + for token in tokens: + token = token.lower() + modified.append(token) + return modified + + @staticmethod + def _capitalize(token): + try: + return token[0].upper() + token[1:] + except IndexError: + return token diff --git a/wildnlp/datasets/conll.py b/wildnlp/datasets/conll.py index a74585c..d3c33aa 100644 --- a/wildnlp/datasets/conll.py +++ b/wildnlp/datasets/conll.py @@ -52,7 +52,7 @@ def load(self, path): processed = self._process_sample(sample) self._data.append(processed) - def apply(self, aspect, apply_to_ne=False): + def apply(self, aspect, apply_to_ne=False, apply_to_both=False): """ :param aspect: transformation function @@ -60,6 +60,9 @@ def apply(self, aspect, apply_to_ne=False): :param apply_to_ne: if `False`, transformation won't be applied to Named Entities. If `True`, transformation will be applied only to Named Entities. + :param apply_to_both: if `True`, transformation will be applied + to both the Named Entities and other tokens. + :return: modified dataset in the following form: @@ -80,10 +83,13 @@ def apply(self, aspect, apply_to_ne=False): for entry in self._data: tags = entry['ner_tags'] - if apply_to_ne is False: - non_ner = np.where(tags == 'O')[0] + if not apply_to_both: + if apply_to_ne is False: + non_ner = np.where(tags == 'O')[0] + else: + non_ner = np.where(tags != 'O')[0] else: - non_ner = np.where(tags != 'O')[0] + non_ner = range(len(entry['tokens'])) if len(non_ner) == 0: modified.append(entry) From 6437d4609e9e9abd183a46e5dfb3e05f35098faf Mon Sep 17 00:00:00 2001 From: Mayank Lunayach Date: Thu, 18 Jul 2019 16:12:40 +0800 Subject: [PATCH 2/6] Simplifying the code --- wildnlp/aspects/lowercase.py | 43 +++++++----------------------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/wildnlp/aspects/lowercase.py b/wildnlp/aspects/lowercase.py index 48755d0..1ac5309 100644 --- a/wildnlp/aspects/lowercase.py +++ b/wildnlp/aspects/lowercase.py @@ -1,45 +1,18 @@ -import random - from .base import Aspect class LowerCase(Aspect): - """Randomly lowercases the dataset + """Lower-cases the dataset - .. caution:: Uses random numbers, default seed is 42. """ - def __init__(self, words_percentage=50, seed=42): - """ - - :param words_percentage: Percentage of the words in the dataset which needs to be lowercased. - Defaults to 0.5. - - :param seed: Random seed. - """ - - if words_percentage >= 1: - words_percentage /= 100. - - random.seed(seed) - def __call__(self, sentence): - tokens = self._tokenize(sentence) - lowercased_tokens = self._lowercase_tokens(tokens) - return self._detokenize(lowercased_tokens) - - @staticmethod - def _lowercase_tokens(tokens): - - modified = [] - for token in tokens: - token = token.lower() - modified.append(token) - return modified + return " ".join([self._lowercase_word(word) + if word != '' else '' + for word in sentence.split(' ')]) @staticmethod - def _capitalize(token): - try: - return token[0].upper() + token[1:] - except IndexError: - return token + def _lowercase_word(word): + if len(word) == 0: + raise ValueError("Can't lowercase empty words") + return word.lower() From d8decf325c48f28ea8871ea4dae3271881b3ddef Mon Sep 17 00:00:00 2001 From: Mayank Lunayach Date: Wed, 17 Jul 2019 18:58:59 +0800 Subject: [PATCH 3/6] Adding lowercase aspect --- wildnlp/aspects/lowercase.py | 45 ++++++++++++++++++++++++++++++++++++ wildnlp/datasets/conll.py | 14 +++++++---- 2 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 wildnlp/aspects/lowercase.py diff --git a/wildnlp/aspects/lowercase.py b/wildnlp/aspects/lowercase.py new file mode 100644 index 0000000..48755d0 --- /dev/null +++ b/wildnlp/aspects/lowercase.py @@ -0,0 +1,45 @@ +import random + +from .base import Aspect + + +class LowerCase(Aspect): + """Randomly lowercases the dataset + + .. caution:: Uses random numbers, default seed is 42. + """ + + def __init__(self, words_percentage=50, seed=42): + """ + + :param words_percentage: Percentage of the words in the dataset which needs to be lowercased. + Defaults to 0.5. + + :param seed: Random seed. + """ + + if words_percentage >= 1: + words_percentage /= 100. + + random.seed(seed) + + def __call__(self, sentence): + tokens = self._tokenize(sentence) + lowercased_tokens = self._lowercase_tokens(tokens) + return self._detokenize(lowercased_tokens) + + @staticmethod + def _lowercase_tokens(tokens): + + modified = [] + for token in tokens: + token = token.lower() + modified.append(token) + return modified + + @staticmethod + def _capitalize(token): + try: + return token[0].upper() + token[1:] + except IndexError: + return token diff --git a/wildnlp/datasets/conll.py b/wildnlp/datasets/conll.py index c551bba..1e5410f 100644 --- a/wildnlp/datasets/conll.py +++ b/wildnlp/datasets/conll.py @@ -54,7 +54,7 @@ def load(self, path): processed = self._process_sample(sample) self._data.append(processed) - def apply(self, aspect, apply_to_ne=False): + def apply(self, aspect, apply_to_ne=False, apply_to_both=False): """ :param aspect: transformation function @@ -62,6 +62,9 @@ def apply(self, aspect, apply_to_ne=False): :param apply_to_ne: if `False`, transformation won't be applied to Named Entities. If `True`, transformation will be applied only to Named Entities. + :param apply_to_both: if `True`, transformation will be applied + to both the Named Entities and other tokens. + :return: modified dataset in the following form: @@ -82,10 +85,13 @@ def apply(self, aspect, apply_to_ne=False): for entry in self._data: tags = entry['ner_tags'] - if apply_to_ne is False: - non_ner = np.where(tags == 'O')[0] + if not apply_to_both: + if apply_to_ne is False: + non_ner = np.where(tags == 'O')[0] + else: + non_ner = np.where(tags != 'O')[0] else: - non_ner = np.where(tags != 'O')[0] + non_ner = range(len(entry['tokens'])) if len(non_ner) == 0: modified.append(entry) From 95accbfa52cedf855eecc642823ca836cd4823ac Mon Sep 17 00:00:00 2001 From: Mayank Lunayach Date: Thu, 18 Jul 2019 16:12:40 +0800 Subject: [PATCH 4/6] Simplifying the code --- wildnlp/aspects/lowercase.py | 43 +++++++----------------------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/wildnlp/aspects/lowercase.py b/wildnlp/aspects/lowercase.py index 48755d0..1ac5309 100644 --- a/wildnlp/aspects/lowercase.py +++ b/wildnlp/aspects/lowercase.py @@ -1,45 +1,18 @@ -import random - from .base import Aspect class LowerCase(Aspect): - """Randomly lowercases the dataset + """Lower-cases the dataset - .. caution:: Uses random numbers, default seed is 42. """ - def __init__(self, words_percentage=50, seed=42): - """ - - :param words_percentage: Percentage of the words in the dataset which needs to be lowercased. - Defaults to 0.5. - - :param seed: Random seed. - """ - - if words_percentage >= 1: - words_percentage /= 100. - - random.seed(seed) - def __call__(self, sentence): - tokens = self._tokenize(sentence) - lowercased_tokens = self._lowercase_tokens(tokens) - return self._detokenize(lowercased_tokens) - - @staticmethod - def _lowercase_tokens(tokens): - - modified = [] - for token in tokens: - token = token.lower() - modified.append(token) - return modified + return " ".join([self._lowercase_word(word) + if word != '' else '' + for word in sentence.split(' ')]) @staticmethod - def _capitalize(token): - try: - return token[0].upper() + token[1:] - except IndexError: - return token + def _lowercase_word(word): + if len(word) == 0: + raise ValueError("Can't lowercase empty words") + return word.lower() From 01b87b4c296c216085574d1ff83ffe50d90c4de7 Mon Sep 17 00:00:00 2001 From: Mayank Lunayach Date: Thu, 18 Jul 2019 16:45:33 +0800 Subject: [PATCH 5/6] Add tests --- tests/aspects/test_lowercase.py | 12 ++++++++++++ wildnlp/aspects/__init__.py | 1 + 2 files changed, 13 insertions(+) create mode 100644 tests/aspects/test_lowercase.py diff --git a/tests/aspects/test_lowercase.py b/tests/aspects/test_lowercase.py new file mode 100644 index 0000000..d16e1b2 --- /dev/null +++ b/tests/aspects/test_lowercase.py @@ -0,0 +1,12 @@ +from wildnlp.aspects import LowerCase + + +def test_single_word(): + assert LowerCase()("Language") == "language" + + +def test_sentence(): + sentence = "EU rejects German call to boycott British lamb." + transformed = LowerCase()(sentence) + + assert transformed == "eu rejects german call to boycott british lamb." diff --git a/wildnlp/aspects/__init__.py b/wildnlp/aspects/__init__.py index 8ed6fd8..36c8e3d 100644 --- a/wildnlp/aspects/__init__.py +++ b/wildnlp/aspects/__init__.py @@ -9,3 +9,4 @@ from .change_char import ChangeChar from .white_spaces import WhiteSpaces from .add_sub_string import AddSubString +from .lowercase import LowerCase From 8096acf14218ef036f7bec63577d4b28db52373d Mon Sep 17 00:00:00 2001 From: Mayank Lunayach Date: Thu, 18 Jul 2019 16:58:17 +0800 Subject: [PATCH 6/6] Add to docs --- docs/source/aspects.rst | 8 ++++++++ wildnlp/aspects/lowercase.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/source/aspects.rst b/docs/source/aspects.rst index 7a42dc2..1f3143d 100644 --- a/docs/source/aspects.rst +++ b/docs/source/aspects.rst @@ -100,6 +100,14 @@ WhiteSpaces ----------------------- .. autoclass:: wildnlp.aspects.white_spaces.WhiteSpaces + :members: + :special-members: __init__ + :show-inheritance: + +Lowercase +----------------------- + +.. autoclass:: wildnlp.aspects.lowercase.LowerCase :members: :special-members: __init__ :show-inheritance: \ No newline at end of file diff --git a/wildnlp/aspects/lowercase.py b/wildnlp/aspects/lowercase.py index 1ac5309..f297430 100644 --- a/wildnlp/aspects/lowercase.py +++ b/wildnlp/aspects/lowercase.py @@ -2,7 +2,7 @@ class LowerCase(Aspect): - """Lower-cases the dataset + """Lower-cases the dataset. """