Skip to content
This repository was archived by the owner on Sep 13, 2022. It is now read-only.

Commit 61aa557

Browse files
committed
Remove hacky fix for single word sentences
- The underlying bug has been fixed in ZPar, so we don't need this hacky fix anymore.
- Remove the hacky fix test from the test files too.
1 parent 718d72a commit 61aa557

File tree

6 files changed

+8
-120
lines changed

6 files changed

+8
-120
lines changed

tests/test_depparser.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,29 +32,6 @@ def test_dep_parse_sentence():
3232
yield check_dep_parse_sentence, True
3333

3434

35-
def test_zpar_bugfix_depparse():
36-
from tests import depparser
37-
38-
sentences = ['REBELLION',
39-
'I am going away .',
40-
'The rebellion is just another word for change and change is necessary to live .',
41-
'REBELLION',
42-
'REBELLION',
43-
'The rebellion is just another word for change and change is necessary to live .',
44-
'REBELLION',
45-
'This is just another sentence .',
46-
'REBELLION']
47-
48-
# tag the above sentences
49-
parsed_sentences = [depparser.dep_parse_sentence(s) for s in sentences]
50-
51-
# get the parses for all of the all-caps single-word sentences
52-
# and make sure they are all the same
53-
indices_to_check = [0, 3, 4, 6, 8]
54-
parses_to_check = [parsed_sentences[i] for i in indices_to_check]
55-
assert_equal(set(parses_to_check), {'REBELLION\tNNP\t-1\tROOT\n'})
56-
57-
5835
def check_dep_parse_file(tokenize=False):
5936
"""
6037
Check parse_file method with and without tokenization

tests/test_parser.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -33,29 +33,6 @@ def test_parse_sentence():
3333
yield check_parse_sentence, True
3434

3535

36-
def test_zpar_bugfix_parse():
37-
from tests import parser
38-
39-
sentences = ['REBELLION',
40-
'I am going away .',
41-
'The rebellion is just another word for change and change is necessary to live .',
42-
'REBELLION',
43-
'REBELLION',
44-
'The rebellion is just another word for change and change is necessary to live .',
45-
'REBELLION',
46-
'This is just another sentence .',
47-
'REBELLION']
48-
49-
# tag the above sentences
50-
parsed_sentences = [parser.parse_sentence(s) for s in sentences]
51-
52-
# get the parses for all of the all-caps single-word sentences
53-
# and make sure they are all the same
54-
indices_to_check = [0, 3, 4, 6, 8]
55-
parses_to_check = [parsed_sentences[i] for i in indices_to_check]
56-
assert_equal(set(parses_to_check), {'(NP (NNP REBELLION))'})
57-
58-
5936
def check_parse_file(tokenize=False):
6037
"""
6138
Check parse_file method with and without tokenization

tests/test_tagger.py

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -33,28 +33,6 @@ def test_tag_sentence():
3333
yield check_tag_sentence, True
3434

3535

36-
def test_zpar_bugfix_tags():
37-
from tests import tagger
38-
39-
sentences = ['REBELLION',
40-
'I am going away .',
41-
'The rebellion is just another word for change and change is necessary to live .',
42-
'REBELLION',
43-
'REBELLION',
44-
'The rebellion is just another word for change and change is necessary to live .',
45-
'REBELLION',
46-
'This is just another sentence .',
47-
'REBELLION']
48-
49-
# tag the above sentences
50-
tagged_sentences = [tagger.tag_sentence(s) for s in sentences]
51-
52-
# get the tags for all of the all-caps single-word sentences
53-
# and make sure they are all NNP
54-
indices_to_check = [0, 3, 4, 6, 8]
55-
tags_to_check = [tagged_sentences[i].split('/')[1] for i in indices_to_check]
56-
assert_equal(set(tags_to_check), {'NNP'})
57-
5836
def check_tag_file(tokenize=False):
5937
"""
6038
Check tag_file method with and without tokenization

zpar/DepParser.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import ctypes as c
88
import logging
99
import os
10-
import re
1110

1211
# set up the logging
1312
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
@@ -55,25 +54,12 @@ def dep_parse_sentence(self, sentence, tokenize=True):
5554
# return empty string if the input is empty
5655
ans = ""
5756
else:
58-
zpar_compatible_sentence = sentence
59-
all_caps_word = ''
60-
# detect if we are processing a sentence with a single word in all caps
61-
# because that is a known bug. This is a hack for now and will be removed
62-
# once the underlying bug is fixed in ZPar.
63-
m = re.match(r'^([A-Z]+)$', zpar_compatible_sentence.strip())
64-
if m:
65-
all_caps_word = m.group(1)
66-
fixed_word = all_caps_word.title()
67-
self.logger.warning('Encountered sentence with all caps single word '
68-
'which triggers a known bug in ZPar. Title-casing '
69-
'to avoid buggy behavior.')
70-
zpar_compatible_sentence = sentence.title()
57+
zpar_compatible_sentence = sentence.strip() + "\n "
7158
zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
7259
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
7360
parsed_sent = self._dep_parse_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
74-
# replace the title-cased word with the original all-caps word if we need to
75-
parsed_sent = parsed_sent.decode('utf-8')
76-
ans = parsed_sent if not all_caps_word else parsed_sent.replace(fixed_word, all_caps_word)
61+
ans = parsed_sent.decode('utf-8')
62+
7763
return ans
7864

7965
def dep_parse_file(self, inputfile, outputfile, tokenize=True):

zpar/Parser.py

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import ctypes as c
88
import logging
99
import os
10-
import re
1110

1211
# set up the logging
1312
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
@@ -54,25 +53,11 @@ def parse_sentence(self, sentence, tokenize=True):
5453
# return empty string if the input is empty
5554
ans = ""
5655
else:
57-
zpar_compatible_sentence = sentence
58-
all_caps_word = ''
59-
# detect if we are processing a sentence with a single word in all caps
60-
# because that is a known bug. This is a hack for now and will be removed
61-
# once the underlying bug is fixed in ZPar.
62-
m = re.match(r'^([A-Z]+)$', zpar_compatible_sentence.strip())
63-
if m:
64-
all_caps_word = m.group(1)
65-
fixed_word = all_caps_word.title()
66-
self.logger.warning('Encountered sentence with all caps single word '
67-
'which triggers a known bug in ZPar. Title-casing '
68-
'to avoid buggy behavior.')
69-
zpar_compatible_sentence = sentence.title()
56+
zpar_compatible_sentence = sentence.strip() + "\n "
7057
zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
7158
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
7259
parsed_sent = self._parse_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
73-
# replace the title-cased word with the original all-caps word if we need to
74-
parsed_sent = parsed_sent.decode('utf-8')
75-
ans = parsed_sent if not all_caps_word else parsed_sent.replace(fixed_word, all_caps_word)
60+
ans = parsed_sent.decode('utf-8')
7661

7762
return ans
7863

zpar/Tagger.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import ctypes as c
77
import logging
88
import os
9-
import re
109

1110
# set up the logging
1211
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
@@ -46,25 +45,11 @@ def tag_sentence(self, sentence, tokenize=True):
4645
# return empty string if the input is empty
4746
ans = ""
4847
else:
49-
zpar_compatible_sentence = sentence
50-
all_caps_word = ''
51-
# detect if we are processing a sentence with a single word in all caps
52-
# because that is a known bug. This is a hack for now and will be removed
53-
# once the underlying bug is fixed in ZPar.
54-
m = re.match(r'^([A-Z]+)$', zpar_compatible_sentence.strip())
55-
if m:
56-
all_caps_word = m.group(1)
57-
fixed_word = all_caps_word.title()
58-
self.logger.warning('Encountered sentence with all caps single word '
59-
'which triggers a known bug in ZPar. Title-casing '
60-
'to avoid buggy behavior.')
61-
zpar_compatible_sentence = sentence.title()
62-
zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
48+
zpar_compatible_sentence = sentence.strip() + "\n "
6349
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
6450
tagged_sent = self._tag_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
65-
# replace the title-cased word with the original all-caps word if we need to
66-
tagged_sent = tagged_sent.decode('utf-8')
67-
ans = tagged_sent if not all_caps_word else tagged_sent.replace(fixed_word, all_caps_word)
51+
ans = tagged_sent.decode('utf-8')
52+
return ans
6853

6954
return ans
7055

0 commit comments

Comments
 (0)