Skip to content
This repository was archived by the owner on Sep 13, 2022. It is now read-only.

Commit 68a479e

Browse files
committed
Adding a hack to handle a ZPar bug
- ZPar has a bug where it produces non-deterministic output for sentences that consist of a single word in all caps. This hack title-cases such a word before passing it to ZPar to make the output deterministic, and then restores the original all-caps word in the returned output. The hack will be removed once the underlying bug in ZPar is fixed; that fix is in progress.
1 parent 79e200c commit 68a479e

File tree

3 files changed

+78
-7
lines changed

3 files changed

+78
-7
lines changed

zpar/DepParser.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,13 @@
55
'''
66

77
import ctypes as c
8+
import logging
89
import os
10+
import re
11+
12+
# set up the logging
13+
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
14+
915

1016
class DepParser(object):
1117
"""The ZPar English Dependency Parser"""
@@ -16,6 +22,9 @@ def __init__(self, modelpath, libptr, zpar_session_obj):
1622
# save the zpar session object
1723
self._zpar_session_obj = zpar_session_obj
1824

25+
# set up a logger
26+
self.logger = logging.getLogger(__name__)
27+
1928
# get the library method that loads the parser models
2029
self._load_depparser = libptr.load_depparser
2130
self._load_depparser.restype = c.c_int
@@ -46,11 +55,25 @@ def dep_parse_sentence(self, sentence, tokenize=True):
4655
# return empty string if the input is empty
4756
ans = ""
4857
else:
49-
zpar_compatible_sentence = sentence.strip() + "\n "
58+
zpar_compatible_sentence = sentence
59+
all_caps_word = ''
60+
# detect if we are processing a sentence with a single word in all caps
61+
# because that is a known bug. This is a hack for now and will be removed
62+
# once the underlying bug is fixed in ZPar.
63+
m = re.match(r'^([A-Z]+)$', zpar_compatible_sentence.strip())
64+
if m:
65+
all_caps_word = m.group(1)
66+
fixed_word = all_caps_word.title()
67+
self.logger.warning('Encountered sentence with all caps single word '
68+
'which triggers a known bug in ZPar. Title-casing '
69+
'to avoid buggy behavior.')
70+
zpar_compatible_sentence = sentence.title()
71+
zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
5072
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
5173
parsed_sent = self._dep_parse_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
52-
ans = parsed_sent.decode('utf-8')
53-
74+
# replace the title-cased word with the original all-caps word if we need to
75+
parsed_sent = parsed_sent.decode('utf-8')
76+
ans = parsed_sent if not all_caps_word else parsed_sent.replace(fixed_word, all_caps_word)
5477
return ans
5578

5679
def dep_parse_file(self, inputfile, outputfile, tokenize=True):

zpar/Parser.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,12 @@
55
'''
66

77
import ctypes as c
8+
import logging
89
import os
10+
import re
11+
12+
# set up the logging
13+
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
914

1015
class Parser(object):
1116
"""The ZPar English Constituency Parser"""
@@ -16,6 +21,9 @@ def __init__(self, modelpath, libptr, zpar_session_obj):
1621
# save the zpar session object
1722
self._zpar_session_obj = zpar_session_obj
1823

24+
# set up a logger
25+
self.logger = logging.getLogger(__name__)
26+
1927
# get the library method that loads the parser models
2028
self._load_parser = libptr.load_parser
2129
self._load_parser.restype = c.c_int
@@ -46,10 +54,26 @@ def parse_sentence(self, sentence, tokenize=True):
4654
# return empty string if the input is empty
4755
ans = ""
4856
else:
49-
zpar_compatible_sentence = sentence.strip() + "\n "
57+
zpar_compatible_sentence = sentence
58+
all_caps_word = ''
59+
# detect if we are processing a sentence with a single word in all caps
60+
# because that is a known bug. This is a hack for now and will be removed
61+
# once the underlying bug is fixed in ZPar.
62+
m = re.match(r'^([A-Z]+)$', zpar_compatible_sentence.strip())
63+
if m:
64+
all_caps_word = m.group(1)
65+
fixed_word = all_caps_word.title()
66+
self.logger.warning('Encountered sentence with all caps single word '
67+
'which triggers a known bug in ZPar. Title-casing '
68+
'to avoid buggy behavior.')
69+
zpar_compatible_sentence = sentence.title()
70+
zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
5071
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
5172
parsed_sent = self._parse_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
52-
ans = parsed_sent.decode('utf-8')
73+
# replace the title-cased word with the original all-caps word if we need to
74+
parsed_sent = parsed_sent.decode('utf-8')
75+
ans = parsed_sent if not all_caps_word else parsed_sent.replace(fixed_word, all_caps_word)
76+
5377
return ans
5478

5579
def parse_file(self, inputfile, outputfile, tokenize=True):

zpar/Tagger.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,13 @@
44
:organization: ETS
55
'''
66
import ctypes as c
7+
import logging
78
import os
9+
import re
10+
11+
# set up the logging
12+
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
13+
814

915
class Tagger(object):
1016
"""The ZPar English POS Tagger"""
@@ -15,6 +21,9 @@ def __init__(self, modelpath, libptr, zpar_session_obj):
1521
# save the zpar session object
1622
self._zpar_session_obj = zpar_session_obj
1723

24+
# set up a logger
25+
self.logger = logging.getLogger(__name__)
26+
1827
# get the library method that loads the tagger models
1928
self._load_tagger = libptr.load_tagger
2029
self._load_tagger.restype = c.c_int
@@ -37,10 +46,25 @@ def tag_sentence(self, sentence, tokenize=True):
3746
# return empty string if the input is empty
3847
ans = ""
3948
else:
40-
zpar_compatible_sentence = sentence.strip() + "\n "
49+
zpar_compatible_sentence = sentence
50+
all_caps_word = ''
51+
# detect if we are processing a sentence with a single word in all caps
52+
# because that is a known bug. This is a hack for now and will be removed
53+
# once the underlying bug is fixed in ZPar.
54+
m = re.match(r'^([A-Z]+)$', zpar_compatible_sentence.strip())
55+
if m:
56+
all_caps_word = m.group(1)
57+
fixed_word = all_caps_word.title()
58+
self.logger.warning('Encountered sentence with all caps single word '
59+
'which triggers a known bug in ZPar. Title-casing '
60+
'to avoid buggy behavior.')
61+
zpar_compatible_sentence = sentence.title()
62+
zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
4163
zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
4264
tagged_sent = self._tag_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
43-
ans = tagged_sent.decode('utf-8')
65+
# replace the title-cased word with the original all-caps word if we need to
66+
tagged_sent = tagged_sent.decode('utf-8')
67+
ans = tagged_sent if not all_caps_word else tagged_sent.replace(fixed_word, all_caps_word)
4468

4569
return ans
4670

0 commit comments

Comments
 (0)