Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 66 additions & 16 deletions text_normalizer/factory/strip.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,91 @@
from typing import List
from typing import List, Tuple
import re

from .toolkit.search_all import search_all
from .toolkit.transform import transform
from .base_factory import BaseFactory


# Whitespace-stripping configuration shared by the Strip factory.
# "left" / "right" / "both" map a strip direction to the compiled regex that
# matches the whitespace run to remove; "rep" is the replacement text that is
# substituted for every matched run (empty string, i.e. plain removal).
PATTERNS = {
    # Leading whitespace anchored at the very start of the string.
    "left": re.compile(r"\A\s+"),
    # Trailing whitespace anchored at the very end of the string.
    "right": re.compile(r"\s+\Z"),
    # Either of the above.
    "both": re.compile(r"\A\s+|\s+\Z"),
    "rep": '',
}


class Strip(BaseFactory):
    """Reversibly strip leading and/or trailing characters from a sentence.

    ``normalize`` removes the matched runs and returns annotations recording
    what was removed and where; ``denormalize`` feeds those annotations back
    to ``transform`` to rebuild the original sentence.
    """

    def __init__(
        self,
        chars: List[str] = None,
        direction: str = 'both',
        name: str = 'strip',
    ):
        """Configure what is stripped and from which side.

        Args:
            chars: Characters to strip. ``None`` (the default) strips
                whitespace using the precompiled ``PATTERNS``. An empty
                list strips nothing, mirroring ``str.strip('')``.
            direction: One of ``'both'``, ``'left'`` or ``'right'``.
            name: Prefix of the factory name. The direction is always
                appended; the joined ``chars`` are appended only when
                ``chars`` is given.

        Raises:
            ValueError: If ``direction`` is not a supported option.
        """
        if direction not in ('both', 'left', 'right'):
            # Build ONE message string: a stray comma previously split it
            # into two exception args and the parts ran together.
            raise ValueError(
                'WRONG direction input! '
                'Direction has three options [both, left, right]. '
                'Your input is {}'.format(direction)
            )

        self.chars = chars
        self.chars_str = None if chars is None else ''.join(chars)
        self.direction = direction
        self.pattern = self._compile_pattern(direction, self.chars_str)

        factory_name = name + '_' + direction
        if self.chars_str is not None:
            factory_name += '_' + self.chars_str
        super().__init__(
            name=factory_name,
            denormalizable=True,
        )

    @staticmethod
    def _compile_pattern(direction: str, chars_str: str = None):
        """Return the compiled regex matching the run(s) to strip."""
        if chars_str is None:
            return PATTERNS[direction]
        if not chars_str:
            # Empty lookahead never matches, so nothing is stripped.
            return re.compile(r'(?!)')
        # re.escape keeps characters such as ']', '^', '-' and '\\' literal
        # inside the character class.
        run = '[' + re.escape(chars_str) + ']+'
        expressions = {
            'left': r'\A' + run,
            'right': run + r'\Z',
            'both': r'\A' + run + '|' + run + r'\Z',
        }
        return re.compile(expressions[direction])

    @staticmethod
    def gen_backward_annotations(
        forward_annotations: List[Tuple[int, int, str]],
    ) -> List[Tuple[int, int, str]]:
        """Map spans in the input string to spans in the stripped string.

        Args:
            forward_annotations: ``(start, end, text)`` spans located in the
                original sentence, in left-to-right order.

        Returns:
            One ``(start, end, replacement)`` span per input span, with
            positions expressed in the coordinates of the output string.
        """
        replacement = PATTERNS["rep"]
        output = []
        # Running difference between output and input coordinates caused by
        # the replacements applied so far.
        offset = 0
        for anno in forward_annotations:
            start, end = anno[0], anno[1]
            new_start = offset + start
            new_end = new_start + len(replacement)
            output.append((new_start, new_end, replacement))
            offset = new_end - end
        return output

    def normalize(
        self,
        sentence: str,
    ) -> Tuple[str, dict]:
        """Strip ``sentence`` and describe the edit.

        Args:
            sentence: Text to strip.

        Returns:
            ``(stripped_sentence, meta)`` where ``meta`` has the keys
            ``'forward'`` (spans removed from the input) and ``'backward'``
            (the corresponding spans in the output).
        """
        # NOTE(review): search_all presumably yields ordered
        # (start, end, matched_text) tuples -- confirm against the toolkit.
        forward_annotations = search_all(
            input_str=sentence,
            reg_pattern=self.pattern,
        )
        backward_annotations = self.gen_backward_annotations(
            forward_annotations)

        output_str = transform(
            input_str=sentence,
            forward_annotations=forward_annotations,
            backward_annotations=backward_annotations,
        )
        return output_str, {
            'forward': forward_annotations,
            'backward': backward_annotations,
        }

    def denormalize(
        self,
        sentence: str,
        meta: dict,
    ) -> str:
        """Restore the text removed by ``normalize``.

        Args:
            sentence: A string previously returned by ``normalize``.
            meta: The metadata dict returned alongside it.

        Returns:
            The sentence with the stripped runs put back.
        """
        # Swapping the roles of the two annotation lists inverts the edit.
        return transform(
            input_str=sentence,
            forward_annotations=meta['backward'],
            backward_annotations=meta['forward'],
        )
230 changes: 171 additions & 59 deletions text_normalizer/factory/test/test_strip.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,184 @@
# -*- coding: utf-8 -*-
from unittest import TestCase
from ..strip import Strip


class StripTemplate:
    """Shared checks for every Strip configuration.

    This is a mixin: a concrete subclass must also inherit from ``TestCase``
    and provide two class attributes:

    * ``normalizer`` -- the object under test, exposing ``normalize`` and
      ``denormalize``.
    * ``test_cases`` -- a list of dicts with the keys ``'input'``,
      ``'output'`` and ``'meta'``.
    """

    def test_normalize(self):
        """``normalize`` yields the expected text and metadata."""
        for i, test_case in enumerate(self.test_cases):
            with self.subTest(i=i):
                output, meta = self.normalizer.normalize(
                    test_case['input'],
                )
                self.assertEqual(test_case['output'], output)
                self.assertEqual(test_case['meta'], meta)

    def test_invertible(self):
        """``denormalize(normalize(x))`` gives back ``x``."""
        for i, test_case in enumerate(self.test_cases):
            # Label sub-tests by index, consistent with test_normalize.
            with self.subTest(i=i):
                output, meta = self.normalizer.normalize(
                    test_case['input'],
                )
                restored = self.normalizer.denormalize(output, meta)
                self.assertEqual(test_case['input'], restored)

class StripDefaultTestCase(StripTemplate, TestCase):
    """Default configuration: whitespace is stripped from both ends."""

    @classmethod
    def setUpClass(cls):
        cls.normalizer = Strip()
        # 'forward' spans index into the input, 'backward' spans into the
        # output; each span is (start, end, text).
        cls.test_cases = [
            {
                'input': ' \n\t\n HAHA\t \t \n \n ',
                'output': 'HAHA',
                'meta': {
                    'forward': [
                        (0, 5, ' \n\t\n '),
                        (9, 17, '\t \t \n \n '),
                    ],
                    'backward': [(0, 0, ''), (4, 4, '')],
                },
            },
            {
                # Whitespace only: the leading alternative consumes it all.
                'input': ' \n \t \t \n',
                'output': '',
                'meta': {
                    'forward': [(0, 8, ' \n \t \t \n')],
                    'backward': [(0, 0, '')],
                },
            },
            {
                'input': '黃金曼特寧好苦QAQ\t\t\n\n ',
                'output': '黃金曼特寧好苦QAQ',
                'meta': {
                    'forward': [(10, 15, '\t\t\n\n ')],
                    'backward': [(10, 10, '')],
                },
            },
            {
                # The leading run is 5 characters long, so the span ends
                # at 5 (it was previously recorded as 7).
                'input': '\t\t \n\n我的空白在前面ㄏㄏ',
                'output': '我的空白在前面ㄏㄏ',
                'meta': {
                    'forward': [(0, 5, '\t\t \n\n')],
                    'backward': [(0, 0, '')],
                },
            },
            {
                # Nothing to strip.
                'input': '隼興大大是專業HR',
                'output': '隼興大大是專業HR',
                'meta': {
                    'forward': [],
                    'backward': [],
                },
            },
        ]

class StripLeftTestCase(StripTemplate, TestCase):
    """``direction='left'``: only leading whitespace is stripped."""

    @classmethod
    def setUpClass(cls):
        cls.normalizer = Strip(direction='left')
        # 'forward' spans index into the input, 'backward' spans into the
        # output; each span is (start, end, text).
        cls.test_cases = [
            {
                'input': ' \n\t\n HAHA\t \t \n \n ',
                'output': 'HAHA\t \t \n \n ',
                'meta': {
                    'forward': [(0, 5, ' \n\t\n ')],
                    'backward': [(0, 0, '')],
                },
            },
            {
                'input': ' \n \t \t \n',
                'output': '',
                'meta': {
                    'forward': [(0, 8, ' \n \t \t \n')],
                    'backward': [(0, 0, '')],
                },
            },
            {
                # Trailing whitespace is left untouched.
                'input': '黃金曼特寧好苦QAQ\t\t\n\n ',
                'output': '黃金曼特寧好苦QAQ\t\t\n\n ',
                'meta': {
                    'forward': [],
                    'backward': [],
                },
            },
            {
                # The leading run is 5 characters long, so the span ends
                # at 5 (it was previously recorded as 7).
                'input': '\t\t \n\n我的空白在前面ㄏㄏ',
                'output': '我的空白在前面ㄏㄏ',
                'meta': {
                    'forward': [(0, 5, '\t\t \n\n')],
                    'backward': [(0, 0, '')],
                },
            },
            {
                'input': '隼興大大是專業HR',
                'output': '隼興大大是專業HR',
                'meta': {
                    'forward': [],
                    'backward': [],
                },
            },
        ]


class StripRightTestCase(StripTemplate, TestCase):
    """``direction='right'``: only trailing whitespace is stripped."""

    @classmethod
    def setUpClass(cls):
        def case(text, expected, forward, backward):
            # Assemble one test-case dict in the shape StripTemplate reads.
            return {
                'input': text,
                'output': expected,
                'meta': {
                    'forward': forward,
                    'backward': backward,
                },
            }

        cls.normalizer = Strip(direction='right')
        cls.test_cases = [
            case(
                ' \n\t\n HAHA\t \t \n \n ',
                ' \n\t\n HAHA',
                [(9, 17, '\t \t \n \n ')],
                [(9, 9, '')],
            ),
            # Whitespace only: everything is removed.
            case(
                ' \n \t \t \n',
                '',
                [(0, 8, ' \n \t \t \n')],
                [(0, 0, '')],
            ),
            case(
                '黃金曼特寧好苦QAQ\t\t\n\n ',
                '黃金曼特寧好苦QAQ',
                [(10, 15, '\t\t\n\n ')],
                [(10, 10, '')],
            ),
            # Leading whitespace is left untouched.
            case(
                '\t\t \n\n我的空白在前面ㄏㄏ',
                '\t\t \n\n我的空白在前面ㄏㄏ',
                [],
                [],
            ),
            case(
                '隼興大大是專業HR',
                '隼興大大是專業HR',
                [],
                [],
            ),
        ]