diff --git a/text_normalizer/factory/strip.py b/text_normalizer/factory/strip.py
index b0c5d6a..c7c9113 100644
--- a/text_normalizer/factory/strip.py
+++ b/text_normalizer/factory/strip.py
@@ -1,41 +1,137 @@
-from typing import List
+from typing import List, Tuple
+import re
+from .toolkit.search_all import search_all
+from .toolkit.transform import transform
 
 from .base_factory import BaseFactory
 
 
+# Whitespace patterns keyed by strip direction.  "rep" is not a pattern:
+# it is the replacement text substituted for every match.
+PATTERNS = {
+    "left": re.compile(r"\A\s+"),
+    "right": re.compile(r"\s+\Z"),
+    "both": re.compile(r"\A\s+|\s+\Z"),
+    "rep": '',
+}
+
+
 class Strip(BaseFactory):
+    """Strip leading and/or trailing whitespace from a sentence.
+
+    ``normalize`` returns the removed spans as meta data so that
+    ``denormalize`` can put them back.
+    """
 
     def __init__(
         self,
-        chars: List[str] = None,
         direction: str = 'both',
         name: str = 'strip',
     ):
-        self.chars = chars
-        if self.chars is None:
-            self.chars_str = None
-        else:
-            self.chars_str = ''.join(chars)
+        """Select which side(s) of a sentence get stripped.
+
+        Args:
+            direction: One of 'both', 'left' or 'right'.
+            name: Prefix of the normalizer name; the direction is
+                appended to it.
+
+        Raises:
+            ValueError: If ``direction`` is not one of the three
+                supported options.
+        """
         if direction not in ['both', 'left', 'right']:
             raise ValueError(
-                'WRONG direction input! '
+                'Not Supported Yet!! '
                 'Direction has three options [both, left, right]',
                 'Your input is {}'.format(direction),
             )
         else:
             self.direction = direction
+            self.pattern = PATTERNS[direction]
+
         super().__init__(
-            name=name + '_' + self.direction + '_' + str(self.chars_str),
-            denormalizable=False,
+            name=name + '_' + self.direction,
+            denormalizable=True,
         )
 
+    @staticmethod
+    def gen_backward_annotations(
+        forward_annotations: List[Tuple[int, int, str]],
+    ) -> List[Tuple[int, int, str]]:
+        """Compute where each removed span lands in the stripped text.
+
+        Args:
+            forward_annotations: ``(start, end, text)`` spans matched in
+                the original sentence; assumed to be ordered by position
+                and non-overlapping.
+
+        Returns:
+            One ``(start, end, PATTERNS["rep"])`` tuple per forward
+            annotation, expressed in positions of the stripped sentence.
+        """
+        replacement = PATTERNS["rep"]
+        output = []
+        # Running shift between input and output positions caused by the
+        # replacements made so far.
+        offset = 0
+        for anno in forward_annotations:
+            new_start = offset + anno[0]
+            new_end = new_start + len(replacement)
+            output.append((new_start, new_end, replacement))
+            offset = new_end - anno[1]
+        return output
+
     def normalize(
         self,
         sentence: str,
-    ) -> (str, List[dict]):
-        if self.direction == 'both':
-            return sentence.strip(self.chars_str), None
-        elif self.direction == 'left':
-            return sentence.lstrip(self.chars_str), None
-        elif self.direction == 'right':
-            return sentence.rstrip(self.chars_str), None
+    ) -> Tuple[str, dict]:
+        """Strip whitespace from the configured side(s) of ``sentence``.
+
+        Args:
+            sentence: Text to normalize.
+
+        Returns:
+            The stripped sentence and a meta dict: 'forward' holds the
+            spans found in ``sentence``, 'backward' the matching spans
+            in the stripped sentence.
+        """
+        forward_annotations = search_all(
+            input_str=sentence,
+            reg_pattern=self.pattern,
+        )
+        backward_annotations = self.gen_backward_annotations(
+            forward_annotations)
+
+        output_str = transform(
+            input_str=sentence,
+            forward_annotations=forward_annotations,
+            backward_annotations=backward_annotations,
+        )
+        return output_str, {
+            'forward': forward_annotations,
+            'backward': backward_annotations,
+        }
+
+    def denormalize(
+        self,
+        sentence: str,
+        meta: dict,
+    ) -> str:
+        """Undo ``normalize`` using the meta data it returned.
+
+        Args:
+            sentence: Text produced by ``normalize``.
+            meta: Meta dict returned by that same ``normalize`` call.
+
+        Returns:
+            The sentence with the stripped whitespace restored.
+        """
+        forward_annotations = meta['forward']
+        backward_annotations = meta['backward']
+        # Swapping the two annotation lists replays the transform backwards.
+        output_str = transform(
+            input_str=sentence,
+            forward_annotations=backward_annotations,
+            backward_annotations=forward_annotations,
+        )
+        return output_str
diff --git a/text_normalizer/factory/test/test_strip.py b/text_normalizer/factory/test/test_strip.py
index 2f5334a..f8b81b2 100644
--- a/text_normalizer/factory/test/test_strip.py
+++ b/text_normalizer/factory/test/test_strip.py
@@ -1,72 +1,189 @@
-# -*- coding: utf-8 -*-
 from unittest import TestCase
 from ..strip import Strip
 
 
-class StripTestCase(TestCase):
+class StripTemplate:
+    """Shared checks for every strip direction.
+
+    Concrete test cases mix this class in and provide ``normalizer``
+    and ``test_cases`` in ``setUpClass``.
+    """
+
+    def test_normalize(self):
+        """normalize() returns the expected text and meta data."""
+        for i, test_case in enumerate(self.test_cases):
+            with self.subTest(i=i):
+                result = self.normalizer.normalize(
+                    test_case['input'],
+                )
+                self.assertEqual(
+                    test_case['output'],
+                    result[0],
+                )
+                self.assertEqual(
+                    test_case['meta'],
+                    result[1],
+                )
+
+    def test_invertible(self):
+        """denormalize() applied to normalize() output restores the input."""
+        for i, test_case in enumerate(self.test_cases):
+            with self.subTest(i=i):
+                result = self.normalizer.normalize(
+                    test_case['input'],
+                )
+                output = self.normalizer.denormalize(
+                    result[0],
+                    result[1],
+                )
+                self.assertEqual(
+                    test_case['input'],
+                    output,
+                )
 
-    def setUp(self):
-        self.strip_text_normalizer_default = Strip()
-        self.strip_text_normalizer_left = Strip(
-            direction='left',
-            chars=['#', ' '],
-        )
-        self.strip_text_normalizer_right = Strip(
-            direction='right',
-            chars=['/', ' '],
-        )
 
-    def test_attributes(self):
-        self.assertEqual(
+class StripDefaultTestCase(StripTemplate, TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.normalizer = Strip()
+        cls.test_cases = [
             {
-                'chars': None,
-                'chars_str': None,
-                'direction': 'both',
-                'denormalizable': False,
-                'name': 'strip_both_None',
+                'input': ' \n\t\n HAHA\t \t \n \n ',
+                'output': 'HAHA',
+                'meta': {
+                    'forward': [(0, 5, ' \n\t\n '), (9, 17, '\t \t \n \n ')],
+                    'backward': [(0, 0, ''), (4, 4, '')],
+                },
             },
-            self.strip_text_normalizer_default.__dict__,
-        )
-        self.assertEqual(
             {
-                'chars': ['#', ' '],
-                'chars_str': '# ',
-                'direction': 'left',
-                'denormalizable': False,
-                'name': 'strip_left_# ',
+                'input': ' \n \t \t \n',
+                'output': '',
+                'meta': {
+                    'forward': [(0, 8, ' \n \t \t \n')],
+                    'backward': [(0, 0, '')],
+                },
             },
-            self.strip_text_normalizer_left.__dict__,
-        )
-        self.assertEqual(
             {
-                'chars': ['/', ' '],
-                'chars_str': '/ ',
-                'direction': 'right',
-                'denormalizable': False,
-                'name': 'strip_right_/ ',
+                'input': '黃金曼特寧好苦QAQ\t\t\n\n ',
+                'output': '黃金曼特寧好苦QAQ',
+                'meta': {
+                    'forward': [(10, 15, '\t\t\n\n ')],
+                    'backward': [(10, 10, '')],
+                },
             },
-            self.strip_text_normalizer_right.__dict__,
-        )
+            {
+                'input': '\t\t   \n\n我的空白在前面ㄏㄏ',
+                'output': '我的空白在前面ㄏㄏ',
+                'meta': {
+                    'forward': [(0, 7, '\t\t   \n\n')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': '隼興大大是專業HR',
+                'output': '隼興大大是專業HR',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+        ]
 
-    def test_normalize(self):
-        result = self.strip_text_normalizer_default.normalize(
-            sentence=' HAHA ',
-        )
-        self.assertEqual(
-            ('HAHA', None),
-            result,
-        )
-        result = self.strip_text_normalizer_left.normalize(
-            sentence='## \t\tHAHA',
-        )
-        self.assertEqual(
-            ('\t\tHAHA', None),
-            result,
-        )
-        result = self.strip_text_normalizer_right.normalize(
-            sentence='HAHA\t\t/// ',
-        )
-        self.assertEqual(
-            ('HAHA\t\t', None),
-            result,
-        )
+
+class StripLeftTestCase(StripTemplate, TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.normalizer = Strip(direction='left')
+        cls.test_cases = [
+            {
+                'input': ' \n\t\n HAHA\t \t \n \n ',
+                'output': 'HAHA\t \t \n \n ',
+                'meta': {
+                    'forward': [(0, 5, ' \n\t\n ')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': ' \n \t \t \n',
+                'output': '',
+                'meta': {
+                    'forward': [(0, 8, ' \n \t \t \n')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': '黃金曼特寧好苦QAQ\t\t\n\n ',
+                'output': '黃金曼特寧好苦QAQ\t\t\n\n ',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+            {
+                'input': '\t\t   \n\n我的空白在前面ㄏㄏ',
+                'output': '我的空白在前面ㄏㄏ',
+                'meta': {
+                    'forward': [(0, 7, '\t\t   \n\n')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': '隼興大大是專業HR',
+                'output': '隼興大大是專業HR',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+        ]
+
+
+class StripRightTestCase(StripTemplate, TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.normalizer = Strip(direction='right')
+        cls.test_cases = [
+            {
+                'input': ' \n\t\n HAHA\t \t \n \n ',
+                'output': ' \n\t\n HAHA',
+                'meta': {
+                    'forward': [(9, 17, '\t \t \n \n ')],
+                    'backward': [(9, 9, '')],
+                },
+            },
+            {
+                'input': ' \n \t \t \n',
+                'output': '',
+                'meta': {
+                    'forward': [(0, 8, ' \n \t \t \n')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': '黃金曼特寧好苦QAQ\t\t\n\n ',
+                'output': '黃金曼特寧好苦QAQ',
+                'meta': {
+                    'forward': [(10, 15, '\t\t\n\n ')],
+                    'backward': [(10, 10, '')],
+                },
+            },
+            {
+                'input': '\t\t   \n\n我的空白在前面ㄏㄏ',
+                'output': '\t\t   \n\n我的空白在前面ㄏㄏ',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+            {
+                'input': '隼興大大是專業HR',
+                'output': '隼興大大是專業HR',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+        ]