Yoctol · SoluMilken · Oct 1, 2018 · Oct 1, 2018 · Oct 2, 2018 · Oct 12, 2018
diff --git a/text_normalizer/factory/strip.py b/text_normalizer/factory/strip.py
@@ -1,41 +1,91 @@
-from typing import List
+from typing import List, Tuple
+import re
 
+from .toolkit.search_all import search_all
+from .toolkit.transform import transform
 from .base_factory import BaseFactory
 
 
+PATTERNS = {  # direction -> regex for the whitespace run to strip
+    "left": re.compile(r"\A\s+"),
+    "right": re.compile(r"(?<!\s)\s+\Z"),  # lookbehind: try run starts only
+    "both": re.compile(r"\A\s+|(?<!\s)\s+\Z"),  # (avoids O(n^2) rescans)
+    "rep": '',  # replacement text used in backward annotations
+}
+
+
 class Strip(BaseFactory):
+    """Reversibly strip whitespace from the ends of a sentence.
+
+    ``direction`` selects which end(s) to strip: 'both', 'left' or
+    'right'; any other value raises ``ValueError``.
+    """
 
     def __init__(
             self,
-            chars: List[str] = None,
             direction: str = 'both',
             name: str = 'strip',
         ):
-        self.chars = chars
-        if self.chars is None:
-            self.chars_str = None
-        else:
-            self.chars_str = ''.join(chars)
         if direction not in ['both', 'left', 'right']:
             raise ValueError(
-                'WRONG direction input! '
+                'Not Supported Yet!! '
                 'Direction has three options [both, left, right]',
                 'Your input is {}'.format(direction),
             )
         else:
             self.direction = direction
+            self.pattern = PATTERNS[direction]
+
         super().__init__(
-            name=name + '_' + self.direction + '_' + str(self.chars_str),
-            denormalizable=False,
+            name=name + '_' + self.direction,
+            denormalizable=True,
         )
 
+    @staticmethod
+    def gen_backward_annotations(
+        forward_annotations: List[Tuple[int, int, str]]):
+        """Return the span each forward match occupies after replacement."""
+        output = []
+        offset = 0  # index shift accumulated from earlier replacements
+        for anno in forward_annotations:
+            new_start = offset + anno[0]
+            new_end = new_start + len(PATTERNS["rep"])
+            output.append((new_start, new_end, PATTERNS["rep"]))
+            offset = new_end - anno[1]
+        return output
+
     def normalize(
             self,
             sentence: str,
-        ) -> (str, List[dict]):
-        if self.direction == 'both':
-            return sentence.strip(self.chars_str), None
-        elif self.direction == 'left':
-            return sentence.lstrip(self.chars_str), None
-        elif self.direction == 'right':
-            return sentence.rstrip(self.chars_str), None
+        ) -> Tuple[str, dict]:
+        """Strip ``sentence``; return it with its annotation metadata dict."""
+        forward_annotations = search_all(
+            input_str=sentence,
+            reg_pattern=self.pattern,
+        )
+        backward_annotations = self.gen_backward_annotations(
+            forward_annotations)
+        output_str = transform(
+            input_str=sentence,
+            forward_annotations=forward_annotations,
+            backward_annotations=backward_annotations,
+        )
+        return output_str, {
+            'forward': forward_annotations,
+            'backward': backward_annotations,
+        }
+
+    def denormalize(
+            self,
+            sentence: str,
+            meta: dict,
+        ) -> str:
+        """Undo ``normalize`` by applying its ``meta`` annotations in reverse."""
+        forward_annotations = meta['forward']
+        backward_annotations = meta['backward']
+        output_str = transform(
+            input_str=sentence,
+            forward_annotations=backward_annotations,
+            backward_annotations=forward_annotations,
+        )
+        return output_str
diff --git a/text_normalizer/factory/test/test_strip.py b/text_normalizer/factory/test/test_strip.py
@@ -1,72 +1,184 @@
-# -*- coding: utf-8 -*-
 from unittest import TestCase
 from ..strip import Strip
 
 
-class StripTestCase(TestCase):
+class StripTemplate:
+    """Shared test methods for the ``Strip`` test cases.
+
+    Subclasses provide ``normalizer`` and ``test_cases`` as class attributes.
+    """
+
+    def test_normalize(self):
+        """normalize() must return each case's expected output and meta."""
+        for i, test_case in enumerate(self.test_cases):
+            with self.subTest(i=i):
+                result = self.normalizer.normalize(test_case['input'])
+                self.assertEqual(
+                    test_case['output'],
+                    result[0],
+                )
+                self.assertEqual(
+                    test_case['meta'],
+                    result[1],
+                )
+
+    def test_invertible(self):
+        """denormalize() must restore each case's original input."""
+        for i, test_case in enumerate(self.test_cases):
+            with self.subTest(i=i):
+                result = self.normalizer.normalize(test_case['input'])
+                output = self.normalizer.denormalize(
+                    result[0],
+                    result[1],
+                )
+                self.assertEqual(
+                    test_case['input'],
+                    output,
+                )
 
-    def setUp(self):
-        self.strip_text_normalizer_default = Strip()
-        self.strip_text_normalizer_left = Strip(
-            direction='left',
-            chars=['#', ' '],
-        )
-        self.strip_text_normalizer_right = Strip(
-            direction='right',
-            chars=['/', ' '],
-        )
 
-    def test_attributes(self):
-        self.assertEqual(
+class StripDefaultTestCase(StripTemplate, TestCase):
+    # Fixtures for Strip() with its default direction ('both').
+    @classmethod
+    def setUpClass(cls):
+        cls.normalizer = Strip()
+        cls.test_cases = [  # iterated by StripTemplate's test methods
             {
-                'chars': None,
-                'chars_str': None,
-                'direction': 'both',
-                'denormalizable': False,
-                'name': 'strip_both_None',
+                'input': ' \n\t\n HAHA\t \t \n \n ',  # padded on both ends
+                'output': 'HAHA',
+                'meta': {
+                    'forward': [(0, 5, ' \n\t\n '), (9, 17, '\t \t \n \n ')],
+                    'backward': [(0, 0, ''), (4, 4, '')],
+                },
             },
-            self.strip_text_normalizer_default.__dict__,
-        )
-        self.assertEqual(
             {
-                'chars': ['#', ' '],
-                'chars_str': '# ',
-                'direction': 'left',
-                'denormalizable': False,
-                'name': 'strip_left_# ',
+                'input': ' \n \t \t \n',  # whitespace only
+                'output': '',
+                'meta': {
+                    'forward': [(0, 8, ' \n \t \t \n')],
+                    'backward': [(0, 0, '')],
+                },
             },
-            self.strip_text_normalizer_left.__dict__,
-        )
-        self.assertEqual(
             {
-                'chars': ['/', ' '],
-                'chars_str': '/ ',
-                'direction': 'right',
-                'denormalizable': False,
-                'name': 'strip_right_/ ',
+                'input': '黃金曼特寧好苦QAQ\t\t\n\n ',  # trailing whitespace only
+                'output': '黃金曼特寧好苦QAQ',
+                'meta': {
+                    'forward': [(10, 15, '\t\t\n\n ')],
+                    'backward': [(10, 10, '')],
+                },
             },
-            self.strip_text_normalizer_right.__dict__,
-        )
+            {
+                'input': '\t\t   \n\n我的空白在前面ㄏㄏ',  # leading whitespace only
+                'output': '我的空白在前面ㄏㄏ',
+                'meta': {
+                    'forward': [(0, 7, '\t\t   \n\n')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': '隼興大大是專業HR',  # nothing to strip
+                'output': '隼興大大是專業HR',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+        ]
 
-    def test_normalize(self):
-        result = self.strip_text_normalizer_default.normalize(
-            sentence='         HAHA               ',
-        )
-        self.assertEqual(
-            ('HAHA', None),
-            result,
-        )
-        result = self.strip_text_normalizer_left.normalize(
-            sentence='##  \t\tHAHA',
-        )
-        self.assertEqual(
-            ('\t\tHAHA', None),
-            result,
-        )
-        result = self.strip_text_normalizer_right.normalize(
-            sentence='HAHA\t\t///  ',
-        )
-        self.assertEqual(
-            ('HAHA\t\t', None),
-            result,
-        )
+
+class StripLeftTestCase(StripTemplate, TestCase):
+    # Fixtures for Strip(direction='left'): only leading whitespace is removed.
+    @classmethod
+    def setUpClass(cls):
+        cls.normalizer = Strip(direction='left')
+        cls.test_cases = [  # iterated by StripTemplate's test methods
+            {
+                'input': ' \n\t\n HAHA\t \t \n \n ',  # padded on both ends
+                'output': 'HAHA\t \t \n \n ',
+                'meta': {
+                    'forward': [(0, 5, ' \n\t\n ')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': ' \n \t \t \n',  # whitespace only
+                'output': '',
+                'meta': {
+                    'forward': [(0, 8, ' \n \t \t \n')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': '黃金曼特寧好苦QAQ\t\t\n\n ',  # trailing only: unchanged
+                'output': '黃金曼特寧好苦QAQ\t\t\n\n ',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+            {
+                'input': '\t\t   \n\n我的空白在前面ㄏㄏ',  # leading whitespace only
+                'output': '我的空白在前面ㄏㄏ',
+                'meta': {
+                    'forward': [(0, 7, '\t\t   \n\n')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': '隼興大大是專業HR',  # nothing to strip
+                'output': '隼興大大是專業HR',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+        ]
+
+
+class StripRightTestCase(StripTemplate, TestCase):
+    # Fixtures for Strip(direction='right'): only trailing whitespace is removed.
+    @classmethod
+    def setUpClass(cls):
+        cls.normalizer = Strip(direction='right')
+        cls.test_cases = [  # iterated by StripTemplate's test methods
+            {
+                'input': ' \n\t\n HAHA\t \t \n \n ',  # padded on both ends
+                'output': ' \n\t\n HAHA',
+                'meta': {
+                    'forward': [(9, 17, '\t \t \n \n ')],
+                    'backward': [(9, 9, '')],
+                },
+            },
+            {
+                'input': ' \n \t \t \n',  # whitespace only
+                'output': '',
+                'meta': {
+                    'forward': [(0, 8, ' \n \t \t \n')],
+                    'backward': [(0, 0, '')],
+                },
+            },
+            {
+                'input': '黃金曼特寧好苦QAQ\t\t\n\n ',  # trailing whitespace only
+                'output': '黃金曼特寧好苦QAQ',
+                'meta': {
+                    'forward': [(10, 15, '\t\t\n\n ')],
+                    'backward': [(10, 10, '')],
+                },
+            },
+            {
+                'input': '\t\t   \n\n我的空白在前面ㄏㄏ',  # leading only: unchanged
+                'output': '\t\t   \n\n我的空白在前面ㄏㄏ',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+            {
+                'input': '隼興大大是專業HR',  # nothing to strip
+                'output': '隼興大大是專業HR',
+                'meta': {
+                    'forward': [],
+                    'backward': [],
+                },
+            },
+        ]