Remove hacky fix for single-word sentences

desilinguist · desilinguist · commit 61aa557378b8 · 2015-07-16T09:28:50.000-04:00
- The underlying bug has been fixed in ZPar, so we no longer need this hacky fix.
- Remove the tests for the hacky fix from the test files too.
diff --git a/tests/test_depparser.py b/tests/test_depparser.py
@@ -32,29 +32,6 @@ def test_dep_parse_sentence():
     yield check_dep_parse_sentence, True
 
 
-def test_zpar_bugfix_depparse():
-    from tests import depparser
-
-    sentences = ['REBELLION',
-                 'I am going away .',
-                 'The rebellion is just another word for change and change is necessary to live .',
-                 'REBELLION',
-                 'REBELLION',
-                 'The rebellion is just another word for change and change is necessary to live .',
-                 'REBELLION',
-                 'This is just another sentence .',
-                 'REBELLION']
-
-    # tag the above sentences
-    parsed_sentences = [depparser.dep_parse_sentence(s) for s in sentences]
-
-    # get the parses for all of the all-caps single-word sentences
-    # and make sure they are all the same
-    indices_to_check = [0, 3, 4, 6, 8]
-    parses_to_check = [parsed_sentences[i] for i in indices_to_check]
-    assert_equal(set(parses_to_check), {'REBELLION\tNNP\t-1\tROOT\n'})
-
-
 def check_dep_parse_file(tokenize=False):
     """
     Check parse_file method with and without tokenization
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -33,29 +33,6 @@ def test_parse_sentence():
     yield check_parse_sentence, True
 
 
-def test_zpar_bugfix_parse():
-    from tests import parser
-
-    sentences = ['REBELLION',
-                 'I am going away .',
-                 'The rebellion is just another word for change and change is necessary to live .',
-                 'REBELLION',
-                 'REBELLION',
-                 'The rebellion is just another word for change and change is necessary to live .',
-                 'REBELLION',
-                 'This is just another sentence .',
-                 'REBELLION']
-
-    # tag the above sentences
-    parsed_sentences = [parser.parse_sentence(s) for s in sentences]
-
-    # get the parses for all of the all-caps single-word sentences
-    # and make sure they are all the same
-    indices_to_check = [0, 3, 4, 6, 8]
-    parses_to_check = [parsed_sentences[i] for i in indices_to_check]
-    assert_equal(set(parses_to_check), {'(NP (NNP REBELLION))'})
-
-
 def check_parse_file(tokenize=False):
     """
     Check parse_file method with and without tokenization
diff --git a/tests/test_tagger.py b/tests/test_tagger.py
@@ -33,28 +33,6 @@ def test_tag_sentence():
     yield check_tag_sentence, True
 
 
-def test_zpar_bugfix_tags():
-    from tests import tagger
-
-    sentences = ['REBELLION',
-                 'I am going away .',
-                 'The rebellion is just another word for change and change is necessary to live .',
-                 'REBELLION',
-                 'REBELLION',
-                 'The rebellion is just another word for change and change is necessary to live .',
-                 'REBELLION',
-                 'This is just another sentence .',
-                 'REBELLION']
-
-    # tag the above sentences
-    tagged_sentences = [tagger.tag_sentence(s) for s in sentences]
-
-    # get the tags for all of the all-caps single-word sentences
-    # and make sure they are all NNP
-    indices_to_check = [0, 3, 4, 6, 8]
-    tags_to_check = [tagged_sentences[i].split('/')[1] for i in indices_to_check]
-    assert_equal(set(tags_to_check), {'NNP'})
-
 def check_tag_file(tokenize=False):
     """
     Check tag_file method with and without tokenization
diff --git a/zpar/DepParser.py b/zpar/DepParser.py
@@ -7,7 +7,6 @@
 import ctypes as c
 import logging
 import os
-import re
 
 # set up the logging
 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
@@ -55,25 +54,12 @@ def dep_parse_sentence(self, sentence, tokenize=True):
             # return empty string if the input is empty
             ans = ""
         else:
-            zpar_compatible_sentence = sentence
-            all_caps_word = ''
-            # detect if we are processing a sentence with a single word in all caps
-            # because that is a known bug. This is a hack for now and will be removed
-            # once the underlying bug is fixed in ZPar.
-            m = re.match(r'^([A-Z]+)$', zpar_compatible_sentence.strip())
-            if m:
-                all_caps_word = m.group(1)
-                fixed_word = all_caps_word.title()
-                self.logger.warning('Encountered sentence with all caps single word '
-                                    'which triggers a known bug in ZPar. Title-casing '
-                                    'to avoid buggy behavior.')
-                zpar_compatible_sentence = sentence.title()
+            zpar_compatible_sentence = sentence.strip() + "\n "
             zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
             zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
             parsed_sent = self._dep_parse_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
-            # replace the title-cased word with the original all-caps word if we need to
-            parsed_sent = parsed_sent.decode('utf-8')
-            ans = parsed_sent if not all_caps_word else parsed_sent.replace(fixed_word, all_caps_word)
+            ans = parsed_sent.decode('utf-8')
+
         return ans
 
     def dep_parse_file(self, inputfile, outputfile, tokenize=True):
diff --git a/zpar/Parser.py b/zpar/Parser.py
@@ -7,7 +7,6 @@
 import ctypes as c
 import logging
 import os
-import re
 
 # set up the logging
 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
@@ -54,25 +53,11 @@ def parse_sentence(self, sentence, tokenize=True):
             # return empty string if the input is empty
             ans = ""
         else:
-            zpar_compatible_sentence = sentence
-            all_caps_word = ''
-            # detect if we are processing a sentence with a single word in all caps
-            # because that is a known bug. This is a hack for now and will be removed
-            # once the underlying bug is fixed in ZPar.
-            m = re.match(r'^([A-Z]+)$', zpar_compatible_sentence.strip())
-            if m:
-                all_caps_word = m.group(1)
-                fixed_word = all_caps_word.title()
-                self.logger.warning('Encountered sentence with all caps single word '
-                                    'which triggers a known bug in ZPar. Title-casing '
-                                    'to avoid buggy behavior.')
-                zpar_compatible_sentence = sentence.title()
+            zpar_compatible_sentence = sentence.strip() + "\n "
             zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
             zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
             parsed_sent = self._parse_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
-            # replace the title-cased word with the original all-caps word if we need to
-            parsed_sent = parsed_sent.decode('utf-8')
-            ans = parsed_sent if not all_caps_word else parsed_sent.replace(fixed_word, all_caps_word)
+            ans = parsed_sent.decode('utf-8')
 
         return ans
 
diff --git a/zpar/Tagger.py b/zpar/Tagger.py
@@ -6,7 +6,6 @@
 import ctypes as c
 import logging
 import os
-import re
 
 # set up the logging
 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
@@ -46,25 +45,11 @@ def tag_sentence(self, sentence, tokenize=True):
             # return empty string if the input is empty
             ans = ""
         else:
-            zpar_compatible_sentence = sentence
-            all_caps_word = ''
-            # detect if we are processing a sentence with a single word in all caps
-            # because that is a known bug. This is a hack for now and will be removed
-            # once the underlying bug is fixed in ZPar.
-            m = re.match(r'^([A-Z]+)$', zpar_compatible_sentence.strip())
-            if m:
-                all_caps_word = m.group(1)
-                fixed_word = all_caps_word.title()
-                self.logger.warning('Encountered sentence with all caps single word '
-                                    'which triggers a known bug in ZPar. Title-casing '
-                                    'to avoid buggy behavior.')
-                zpar_compatible_sentence = sentence.title()
-            zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
+            zpar_compatible_sentence = sentence.strip() + "\n "
             zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
             tagged_sent = self._tag_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
-            # replace the title-cased word with the original all-caps word if we need to
-            tagged_sent = tagged_sent.decode('utf-8')
-            ans = tagged_sent if not all_caps_word else tagged_sent.replace(fixed_word, all_caps_word)
+            ans = tagged_sent.decode('utf-8')
+            return ans
 
         return ans