From b9e1c1be18843df9deb295d299398e06bd5f616a Mon Sep 17 00:00:00 2001 From: Solumilken Date: Mon, 1 Oct 2018 15:59:59 +0800 Subject: [PATCH 1/5] denormalizable and new format of meta --- text_normalizer/factory/strip.py | 87 ++++++-- text_normalizer/factory/test/test_strip.py | 230 +++++++++++++++------ 2 files changed, 243 insertions(+), 74 deletions(-) diff --git a/text_normalizer/factory/strip.py b/text_normalizer/factory/strip.py index b0c5d6a..7098ba0 100644 --- a/text_normalizer/factory/strip.py +++ b/text_normalizer/factory/strip.py @@ -1,21 +1,26 @@ -from typing import List +from typing import List, Tuple +import re +from .toolkit.search_all import search_all +from .toolkit.transform import transform from .base_factory import BaseFactory +PATTERNS = { + "left": re.compile(r"\A\s+.{0,1}"), + "right": re.compile(r".{0,1}\s+\Z"), + "both": re.compile(r"\A\s+.{0,1}|.{0,1}\s+\Z"), + "rep": re.compile(r"[^\s]+"), +} + + class Strip(BaseFactory): def __init__( self, - chars: List[str] = None, direction: str = 'both', name: str = 'strip', ): - self.chars = chars - if self.chars is None: - self.chars_str = None - else: - self.chars_str = ''.join(chars) if direction not in ['both', 'left', 'right']: raise ValueError( 'WRONG direction input! ' @@ -24,18 +29,70 @@ def __init__( ) else: self.direction = direction + self.pattern = PATTERNS[direction] + super().__init__( - name=name + '_' + self.direction + '_' + str(self.chars_str), - denormalizable=False, + name=name + '_' + self.direction, + denormalizable=True, ) + + @staticmethod + def gen_backward_annotations( + forward_annotations: List[Tuple[int, int, str]]): + output = [] + offset = 0 + for anno in forward_annotations: + rep = PATTERNS["rep"].findall(anno[2]) + if len(rep) > 0: + rep = rep[0] + else: + rep = "" + new_start = offset + anno[0] + new_end = new_start + len(rep) + output.append( + ( + new_start, + new_end, + rep, + ), + ) + offset = new_end - anno[1] + return output + + def normalize( self, sentence: str, ) -> (str, List[dict]): - if self.direction == 'both': - return sentence.strip(self.chars_str), None - elif self.direction == 'left': - return sentence.lstrip(self.chars_str), None - elif self.direction == 'right': - return sentence.rstrip(self.chars_str), None + + forward_annotations = search_all( + input_str=sentence, + reg_pattern=self.pattern, + ) + backward_annotations = self.gen_backward_annotations( + forward_annotations) + + output_str = transform( + input_str=sentence, + forward_annotations=forward_annotations, + backward_annotations=backward_annotations, + ) + return output_str, { + 'for': forward_annotations, + 'back': backward_annotations, + } + + def denormalize( + self, + sentence: str, + meta: dict, + ) -> str: + forward_annotations = meta['for'] + backward_annotations = meta['back'] + output_str = transform( + input_str=sentence, + forward_annotations=backward_annotations, + backward_annotations=forward_annotations, + ) + return output_str diff --git a/text_normalizer/factory/test/test_strip.py b/text_normalizer/factory/test/test_strip.py index 2f5334a..41abd5c 100644 --- a/text_normalizer/factory/test/test_strip.py +++ b/text_normalizer/factory/test/test_strip.py @@ -1,72 +1,184 @@ -# -*- coding: utf-8 -*- from unittest import TestCase from ..strip import Strip -class StripTestCase(TestCase): +class StripTemplate: + + def test_normalize(self): + for i in range(len(self.test_cases)): + test_case = self.test_cases[i] + with self.subTest(i=test_case): + result = self.normalizer.normalize( + test_case['input'] + ) + self.assertEqual( + test_case['output'], + result[0], + ) + self.assertEqual( + test_case['meta'], + result[1], + ) + + def test_invertible(self): + for i in range(len(self.test_cases)): + test_case = self.test_cases[i] + with self.subTest(i=test_case): + result = self.normalizer.normalize( + test_case['input'] + ) + output = self.normalizer.denormalize( + result[0], + result[1] + ) + self.assertEqual( + test_case['input'], + output, + ) - def setUp(self): - self.strip_text_normalizer_default = Strip() - self.strip_text_normalizer_left = Strip( - direction='left', - chars=['#', ' '], - ) - self.strip_text_normalizer_right = Strip( - direction='right', - chars=['/', ' '], - ) - def test_attributes(self): - self.assertEqual( +class StripDefaultTestCase(StripTemplate, TestCase): + + @classmethod + def setUpClass(cls): + cls.normalizer = Strip() + cls.test_cases = [ { - 'chars': None, - 'chars_str': None, - 'direction': 'both', - 'denormalizable': False, - 'name': 'strip_both_None', + 'input': ' \n\t\n HAHA\t \t \n \n ', + 'output': 'HAHA', + 'meta': { + 'for': [(0, 6, ' \n\t\n H'), (8, 17, 'A\t \t \n \n ')], + 'back': [(0, 1, 'H'), (3, 4, 'A')], + }, }, - self.strip_text_normalizer_default.__dict__, - ) - self.assertEqual( { - 'chars': ['#', ' '], - 'chars_str': '# ', - 'direction': 'left', - 'denormalizable': False, - 'name': 'strip_left_# ', + 'input': ' \n \t \t \n', + 'output': '', + 'meta': { + 'for': [(0, 8, ' \n \t \t \n')], + 'back': [(0, 0, '')], + }, }, - self.strip_text_normalizer_left.__dict__, - ) - self.assertEqual( { - 'chars': ['/', ' '], - 'chars_str': '/ ', - 'direction': 'right', - 'denormalizable': False, - 'name': 'strip_right_/ ', + 'input': '黃金曼特寧好苦QAQ\t\t\n\n ', + 'output': '黃金曼特寧好苦QAQ', + 'meta': { + 'for': [(9, 15, 'Q\t\t\n\n ')], + 'back': [(9, 10, 'Q')], + }, }, - self.strip_text_normalizer_right.__dict__, - ) + { + 'input': '\t\t \n\n我的空白在前面ㄏㄏ', + 'output': '我的空白在前面ㄏㄏ', + 'meta': { + 'for': [(0, 8, '\t\t \n\n我')], + 'back': [(0, 1, '我')], + } + }, + { + 'input': '隼興大大是專業HR', + 'output': '隼興大大是專業HR', + 'meta': { + 'for': [], + 'back': [], + } + }, + ] - def test_normalize(self): - result = self.strip_text_normalizer_default.normalize( - sentence=' HAHA ', - ) - self.assertEqual( - ('HAHA', None), - result, - ) - result = self.strip_text_normalizer_left.normalize( - sentence='## \t\tHAHA', - ) - self.assertEqual( - ('\t\tHAHA', None), - result, - ) - result = self.strip_text_normalizer_right.normalize( - sentence='HAHA\t\t/// ', - ) - self.assertEqual( - ('HAHA\t\t', None), - result, - ) + +class StripLeftTestCase(StripTemplate, TestCase): + + @classmethod + def setUpClass(cls): + cls.normalizer = Strip(direction='left') + cls.test_cases = [ + { + 'input': ' \n\t\n HAHA\t \t \n \n ', + 'output': 'HAHA\t \t \n \n ', + 'meta': { + 'for': [(0, 6, ' \n\t\n H')], + 'back': [(0, 1, 'H')], + }, + }, + { + 'input': ' \n \t \t \n', + 'output': '', + 'meta': { + 'for': [(0, 8, ' \n \t \t \n')], + 'back': [(0, 0, '')], + }, + }, + { + 'input': '黃金曼特寧好苦QAQ\t\t\n\n ', + 'output': '黃金曼特寧好苦QAQ\t\t\n\n ', + 'meta': { + 'for': [], + 'back': [], + }, + }, + { + 'input': '\t\t \n\n我的空白在前面ㄏㄏ', + 'output': '我的空白在前面ㄏㄏ', + 'meta': { + 'for': [(0, 8, '\t\t \n\n我')], + 'back': [(0, 1, '我')], + } + }, + { + 'input': '隼興大大是專業HR', + 'output': '隼興大大是專業HR', + 'meta': { + 'for': [], + 'back': [], + } + }, + ] + + +class StripRightTestCase(StripTemplate, TestCase): + + @classmethod + def setUpClass(cls): + cls.normalizer = Strip(direction='right') + cls.test_cases = [ + { + 'input': ' \n\t\n HAHA\t \t \n \n ', + 'output': ' \n\t\n HAHA', + 'meta': { + 'for': [(8, 17, 'A\t \t \n \n ')], + 'back': [(8, 9, 'A')], + }, + }, + { + 'input': ' \n \t \t \n', + 'output': '', + 'meta': { + 'for': [(0, 8, ' \n \t \t \n')], + 'back': [(0, 0, '')], + }, + }, + { + 'input': '黃金曼特寧好苦QAQ\t\t\n\n ', + 'output': '黃金曼特寧好苦QAQ', + 'meta': { + 'for': [(9, 15, 'Q\t\t\n\n ')], + 'back': [(9, 10, 'Q')], + }, + }, + { + 'input': '\t\t \n\n我的空白在前面ㄏㄏ', + 'output': '\t\t \n\n我的空白在前面ㄏㄏ', + 'meta': { + 'for': [], + 'back': [], + } + }, + { + 'input': '隼興大大是專業HR', + 'output': '隼興大大是專業HR', + 'meta': { + 'for': [], + 'back': [], + } + }, + ] From d14ff69332777e572e9b5ddb505eed0e8399adc5 Mon Sep 17 00:00:00 2001 From: Solumilken Date: Mon, 1 Oct 2018 17:36:46 +0800 Subject: [PATCH 2/5] fit flake8 --- text_normalizer/factory/strip.py | 2 -- text_normalizer/factory/test/test_strip.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/text_normalizer/factory/strip.py b/text_normalizer/factory/strip.py index 7098ba0..1d32449 100644 --- a/text_normalizer/factory/strip.py +++ b/text_normalizer/factory/strip.py @@ -36,7 +36,6 @@ def __init__( denormalizable=True, ) - @staticmethod def gen_backward_annotations( forward_annotations: List[Tuple[int, int, str]]): @@ -60,7 +59,6 @@ def gen_backward_annotations( offset = new_end - anno[1] return output - def normalize( self, sentence: str, diff --git a/text_normalizer/factory/test/test_strip.py b/text_normalizer/factory/test/test_strip.py index 41abd5c..8163d0b 100644 --- a/text_normalizer/factory/test/test_strip.py +++ b/text_normalizer/factory/test/test_strip.py @@ -9,7 +9,7 @@ def test_normalize(self): test_case = self.test_cases[i] with self.subTest(i=test_case): result = self.normalizer.normalize( - test_case['input'] + test_case['input'], ) self.assertEqual( test_case['output'], @@ -25,11 +25,11 @@ def test_invertible(self): test_case = self.test_cases[i] with self.subTest(i=test_case): result = self.normalizer.normalize( - test_case['input'] + test_case['input'], ) output = self.normalizer.denormalize( result[0], - result[1] + result[1], ) self.assertEqual( test_case['input'], @@ -73,7 +73,7 @@ def setUpClass(cls): 'meta': { 'for': [(0, 8, '\t\t \n\n我')], 'back': [(0, 1, '我')], - } + }, }, { 'input': '隼興大大是專業HR', @@ -81,7 +81,7 @@ def setUpClass(cls): 'meta': { 'for': [], 'back': [], - } + }, }, ] @@ -122,7 +122,7 @@ def setUpClass(cls): 'meta': { 'for': [(0, 8, '\t\t \n\n我')], 'back': [(0, 1, '我')], - } + }, }, { 'input': '隼興大大是專業HR', @@ -130,7 +130,7 @@ def setUpClass(cls): 'meta': { 'for': [], 'back': [], - } + }, }, ] @@ -171,7 +171,7 @@ def setUpClass(cls): 'meta': { 'for': [], 'back': [], - } + }, }, { 'input': '隼興大大是專業HR', @@ -179,6 +179,6 @@ def setUpClass(cls): 'meta': { 'for': [], 'back': [], - } + }, }, ] From 423a082c4f6c9888b985e291e83eb37a3f690fb3 Mon Sep 17 00:00:00 2001 From: Solumilken Date: Tue, 2 Oct 2018 16:03:48 +0800 Subject: [PATCH 3/5] clearer key name --- text_normalizer/factory/strip.py | 10 ++-- text_normalizer/factory/test/test_strip.py | 60 +++++++++++----------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/text_normalizer/factory/strip.py b/text_normalizer/factory/strip.py index 1d32449..f28c262 100644 --- a/text_normalizer/factory/strip.py +++ b/text_normalizer/factory/strip.py @@ -23,7 +23,7 @@ def __init__( ): if direction not in ['both', 'left', 'right']: raise ValueError( - 'WRONG direction input! ' + 'Not Supported Yet!!' 'Direction has three options [both, left, right]', 'Your input is {}'.format(direction), ) @@ -77,8 +77,8 @@ def normalize( backward_annotations=backward_annotations, ) return output_str, { - 'for': forward_annotations, - 'back': backward_annotations, + 'forward': forward_annotations, + 'backward': backward_annotations, } def denormalize( @@ -86,8 +86,8 @@ def denormalize( sentence: str, meta: dict, ) -> str: - forward_annotations = meta['for'] - backward_annotations = meta['back'] + forward_annotations = meta['forward'] + backward_annotations = meta['backward'] output_str = transform( input_str=sentence, forward_annotations=backward_annotations, diff --git a/text_normalizer/factory/test/test_strip.py b/text_normalizer/factory/test/test_strip.py index 8163d0b..9074e36 100644 --- a/text_normalizer/factory/test/test_strip.py +++ b/text_normalizer/factory/test/test_strip.py @@ -47,40 +47,40 @@ def setUpClass(cls): 'input': ' \n\t\n HAHA\t \t \n \n ', 'output': 'HAHA', 'meta': { - 'for': [(0, 6, ' \n\t\n H'), (8, 17, 'A\t \t \n \n ')], - 'back': [(0, 1, 'H'), (3, 4, 'A')], + 'forward': [(0, 6, ' \n\t\n H'), (8, 17, 'A\t \t \n \n ')], + 'backward': [(0, 1, 'H'), (3, 4, 'A')], }, }, { 'input': ' \n \t \t \n', 'output': '', 'meta': { - 'for': [(0, 8, ' \n \t \t \n')], - 'back': [(0, 0, '')], + 'forward': [(0, 8, ' \n \t \t \n')], + 'backward': [(0, 0, '')], }, }, { 'input': '黃金曼特寧好苦QAQ\t\t\n\n ', 'output': '黃金曼特寧好苦QAQ', 'meta': { - 'for': [(9, 15, 'Q\t\t\n\n ')], - 'back': [(9, 10, 'Q')], + 'forward': [(9, 15, 'Q\t\t\n\n ')], + 'backward': [(9, 10, 'Q')], }, }, { 'input': '\t\t \n\n我的空白在前面ㄏㄏ', 'output': '我的空白在前面ㄏㄏ', 'meta': { - 'for': [(0, 8, '\t\t \n\n我')], - 'back': [(0, 1, '我')], + 'forward': [(0, 8, '\t\t \n\n我')], + 'backward': [(0, 1, '我')], }, }, { 'input': '隼興大大是專業HR', 'output': '隼興大大是專業HR', 'meta': { - 'for': [], - 'back': [], + 'forward': [], + 'backward': [], }, }, ] @@ -96,40 +96,40 @@ def setUpClass(cls): 'input': ' \n\t\n HAHA\t \t \n \n ', 'output': 'HAHA\t \t \n \n ', 'meta': { - 'for': [(0, 6, ' \n\t\n H')], - 'back': [(0, 1, 'H')], + 'forward': [(0, 6, ' \n\t\n H')], + 'backward': [(0, 1, 'H')], }, }, { 'input': ' \n \t \t \n', 'output': '', 'meta': { - 'for': [(0, 8, ' \n \t \t \n')], - 'back': [(0, 0, '')], + 'forward': [(0, 8, ' \n \t \t \n')], + 'backward': [(0, 0, '')], }, }, { 'input': '黃金曼特寧好苦QAQ\t\t\n\n ', 'output': '黃金曼特寧好苦QAQ\t\t\n\n ', 'meta': { - 'for': [], - 'back': [], + 'forward': [], + 'backward': [], }, }, { 'input': '\t\t \n\n我的空白在前面ㄏㄏ', 'output': '我的空白在前面ㄏㄏ', 'meta': { - 'for': [(0, 8, '\t\t \n\n我')], - 'back': [(0, 1, '我')], + 'forward': [(0, 8, '\t\t \n\n我')], + 'backward': [(0, 1, '我')], }, }, { 'input': '隼興大大是專業HR', 'output': '隼興大大是專業HR', 'meta': { - 'for': [], - 'back': [], + 'forward': [], + 'backward': [], }, }, ] @@ -145,40 +145,40 @@ def setUpClass(cls): 'input': ' \n\t\n HAHA\t \t \n \n ', 'output': ' \n\t\n HAHA', 'meta': { - 'for': [(8, 17, 'A\t \t \n \n ')], - 'back': [(8, 9, 'A')], + 'forward': [(8, 17, 'A\t \t \n \n ')], + 'backward': [(8, 9, 'A')], }, }, { 'input': ' \n \t \t \n', 'output': '', 'meta': { - 'for': [(0, 8, ' \n \t \t \n')], - 'back': [(0, 0, '')], + 'forward': [(0, 8, ' \n \t \t \n')], + 'backward': [(0, 0, '')], }, }, { 'input': '黃金曼特寧好苦QAQ\t\t\n\n ', 'output': '黃金曼特寧好苦QAQ', 'meta': { - 'for': [(9, 15, 'Q\t\t\n\n ')], - 'back': [(9, 10, 'Q')], + 'forward': [(9, 15, 'Q\t\t\n\n ')], + 'backward': [(9, 10, 'Q')], }, }, { 'input': '\t\t \n\n我的空白在前面ㄏㄏ', 'output': '\t\t \n\n我的空白在前面ㄏㄏ', 'meta': { - 'for': [], - 'back': [], + 'forward': [], + 'backward': [], }, }, { 'input': '隼興大大是專業HR', 'output': '隼興大大是專業HR', 'meta': { - 'for': [], - 'back': [], + 'forward': [], + 'backward': [], }, }, ] From e9e600c9602eace420e726932183c5c78a9219bb Mon Sep 17 00:00:00 2001 From: Solumilken Date: Fri, 12 Oct 2018 14:37:46 +0800 Subject: [PATCH 4/5] do not extract char not in \s --- text_normalizer/factory/strip.py | 22 ++++++++-------- text_normalizer/factory/test/test_strip.py | 30 +++++++++++----------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/text_normalizer/factory/strip.py b/text_normalizer/factory/strip.py index f28c262..09611c1 100644 --- a/text_normalizer/factory/strip.py +++ b/text_normalizer/factory/strip.py @@ -7,10 +7,10 @@ PATTERNS = { - "left": re.compile(r"\A\s+.{0,1}"), - "right": re.compile(r".{0,1}\s+\Z"), - "both": re.compile(r"\A\s+.{0,1}|.{0,1}\s+\Z"), - "rep": re.compile(r"[^\s]+"), + "left": re.compile(r"\A\s+"), + "right": re.compile(r"\s+\Z"), + "both": re.compile(r"\A\s+|\s+\Z"), + "rep": '', } @@ -42,18 +42,18 @@ def gen_backward_annotations( output = [] offset = 0 for anno in forward_annotations: - rep = PATTERNS["rep"].findall(anno[2]) - if len(rep) > 0: - rep = rep[0] - else: - rep = "" + # rep = PATTERNS["rep"].findall(anno[2]) + # if len(rep) > 0: + # rep = rep[0] + # else: + # rep = "" new_start = offset + anno[0] - new_end = new_start + len(rep) + new_end = new_start + len(PATTERNS["rep"]) output.append( ( new_start, new_end, - rep, + PATTERNS["rep"], ), ) offset = new_end - anno[1] diff --git a/text_normalizer/factory/test/test_strip.py b/text_normalizer/factory/test/test_strip.py index 9074e36..f8b81b2 100644 --- a/text_normalizer/factory/test/test_strip.py +++ b/text_normalizer/factory/test/test_strip.py @@ -7,7 +7,7 @@ class StripTemplate: def test_normalize(self): for i in range(len(self.test_cases)): test_case = self.test_cases[i] - with self.subTest(i=test_case): + with self.subTest(i=i): result = self.normalizer.normalize( test_case['input'], ) @@ -47,8 +47,8 @@ def setUpClass(cls): 'input': ' \n\t\n HAHA\t \t \n \n ', 'output': 'HAHA', 'meta': { - 'forward': [(0, 6, ' \n\t\n H'), (8, 17, 'A\t \t \n \n ')], - 'backward': [(0, 1, 'H'), (3, 4, 'A')], + 'forward': [(0, 5, ' \n\t\n '), (9, 17, '\t \t \n \n ')], + 'backward': [(0, 0, ''), (4, 4, '')], }, }, { @@ -63,16 +63,16 @@ def setUpClass(cls): 'input': '黃金曼特寧好苦QAQ\t\t\n\n ', 'output': '黃金曼特寧好苦QAQ', 'meta': { - 'forward': [(9, 15, 'Q\t\t\n\n ')], - 'backward': [(9, 10, 'Q')], + 'forward': [(10, 15, '\t\t\n\n ')], + 'backward': [(10, 10, '')], }, }, { 'input': '\t\t \n\n我的空白在前面ㄏㄏ', 'output': '我的空白在前面ㄏㄏ', 'meta': { - 'forward': [(0, 8, '\t\t \n\n我')], - 'backward': [(0, 1, '我')], + 'forward': [(0, 7, '\t\t \n\n')], + 'backward': [(0, 0, '')], }, }, { @@ -96,8 +96,8 @@ def setUpClass(cls): 'input': ' \n\t\n HAHA\t \t \n \n ', 'output': 'HAHA\t \t \n \n ', 'meta': { - 'forward': [(0, 6, ' \n\t\n H')], - 'backward': [(0, 1, 'H')], + 'forward': [(0, 5, ' \n\t\n ')], + 'backward': [(0, 0, '')], }, }, { @@ -120,8 +120,8 @@ def setUpClass(cls): 'input': '\t\t \n\n我的空白在前面ㄏㄏ', 'output': '我的空白在前面ㄏㄏ', 'meta': { - 'forward': [(0, 8, '\t\t \n\n我')], - 'backward': [(0, 1, '我')], + 'forward': [(0, 7, '\t\t \n\n')], + 'backward': [(0, 0, '')], }, }, { @@ -145,8 +145,8 @@ def setUpClass(cls): 'input': ' \n\t\n HAHA\t \t \n \n ', 'output': ' \n\t\n HAHA', 'meta': { - 'forward': [(8, 17, 'A\t \t \n \n ')], - 'backward': [(8, 9, 'A')], + 'forward': [(9, 17, '\t \t \n \n ')], + 'backward': [(9, 9, '')], }, }, { @@ -161,8 +161,8 @@ def setUpClass(cls): 'input': '黃金曼特寧好苦QAQ\t\t\n\n ', 'output': '黃金曼特寧好苦QAQ', 'meta': { - 'forward': [(9, 15, 'Q\t\t\n\n ')], - 'backward': [(9, 10, 'Q')], + 'forward': [(10, 15, '\t\t\n\n ')], + 'backward': [(10, 10, '')], }, }, { From 2428a2d077cd7673a9d1626ac7f9d6fb54a1323a Mon Sep 17 00:00:00 2001 From: Solumilken Date: Fri, 12 Oct 2018 14:44:26 +0800 Subject: [PATCH 5/5] removed redundant --- text_normalizer/factory/strip.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/text_normalizer/factory/strip.py b/text_normalizer/factory/strip.py index 09611c1..c7c9113 100644 --- a/text_normalizer/factory/strip.py +++ b/text_normalizer/factory/strip.py @@ -42,11 +42,6 @@ def gen_backward_annotations( output = [] offset = 0 for anno in forward_annotations: - # rep = PATTERNS["rep"].findall(anno[2]) - # if len(rep) > 0: - # rep = rep[0] - # else: - # rep = "" new_start = offset + anno[0] new_end = new_start + len(PATTERNS["rep"]) output.append(