PY3: some work towards text_de

hayd · hayd · commit 3b9e5d6dcc74 · 2014-12-09T17:59:55.000-08:00
diff --git a/pattern/text/__init__.py b/pattern/text/__init__.py
@@ -399,10 +399,11 @@ def _read(path, encoding="utf-8", comment=";;;"):
             # From file or buffer.
             f = path
         for i, line in enumerate(f):
-            line = line.strip(codecs.BOM_UTF8) if i == 0 and isinstance(
-                line, str) else line
+            line = (line.strip(codecs.BOM_UTF8)
+                    if i == 0 and isinstance(line, bytes)
+                    else line)
             line = line.strip()
-            line = decode_utf8(line, encoding)
+            line = line.decode(encoding)
             if not line or (comment and line.startswith(comment)):
                 continue
             yield line
@@ -424,6 +425,7 @@ def load(self):
         # Arnold NNP x
         dict.update(self, (x.split(" ")[:2] for x in _read(self._path)))
 
+
 #--- FREQUENCY -----------------------------------------------------------
 
 
@@ -859,7 +861,7 @@ def __init__(self, lexicon={}, frequency={}, model=None, morphology=None, contex
             The given default tags are used for unknown words.
             Unknown words that start with a capital letter are tagged NNP (except for German).
             Unknown words that contain only digits and punctuation are tagged CD.
-            Optionally, morphological and contextual rules (or a language model) can be used 
+            Optionally, morphological and contextual rules (or a language model) can be used
             to improve the tags of unknown words.
             The given language can be used to discern between
             Germanic and Romance languages for phrase chunking.
@@ -1727,7 +1729,7 @@ def commandline(parse=Parser().parse):
         # The output can be either slash-formatted string or XML.
         if "xml" in arguments:
             s = Tree(s, s.tags).xml
-        print(encode_utf8(s))
+        print(s)
 
 #### VERBS ###############################################################
 
diff --git a/pattern/text/tree.py b/pattern/text/tree.py
@@ -88,7 +88,7 @@ def unique(iterable):
 
 
 def zip(*args, **kwargs):
-    """ Returns a list of tuples, where the i-th tuple contains the i-th element 
+    """ Returns a list of tuples, where the i-th tuple contains the i-th element
         from each of the argument sequences or iterables (or default if too short).
     """
     args = [list(iterable) for iterable in args]
@@ -810,13 +810,13 @@ def append(self, word, lemma=None, type=None, chunk=None, role=None, relation=No
     def parse_token(self, token, tags=[WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA]):
         """ Returns the arguments for Sentence.append() from a tagged token representation.
             The order in which token tags appear can be specified.
-            The default order is (separated by slashes): 
-            - word, 
-            - part-of-speech, 
-            - (IOB-)chunk, 
-            - (IOB-)preposition, 
-            - chunk(-relation)(-role), 
-            - anchor, 
+            The default order is (separated by slashes):
+            - word,
+            - part-of-speech,
+            - (IOB-)chunk,
+            - (IOB-)preposition,
+            - chunk(-relation)(-role),
+            - anchor,
             - lemma.
             Examples:
             The/DT/B-NP/O/NP-SBJ-1/O/the
@@ -1079,7 +1079,7 @@ def get(self, index, tag=LEMMA):
 
     def loop(self, *tags):
         """ Iterates over the tags in the entire Sentence,
-            For example, Sentence.loop(POS, LEMMA) yields tuples of the part-of-speech tags and lemmata. 
+            For example, Sentence.loop(POS, LEMMA) yields tuples of the part-of-speech tags and lemmata.
             Possible tags: WORD, LEMMA, POS, CHUNK, PNP, RELATION, ROLE, ANCHOR or a custom word tag.
             Any order or combination of tags can be supplied.
         """
@@ -1339,7 +1339,12 @@ def xml(self):
         xml.append("<%s>" % XML_TEXT)
         xml.extend([sentence.xml for sentence in self])
         xml.append("</%s>" % XML_TEXT)
-        return "\n".join(xml)
+        xml_ = "\n".join(xml)
+        try:
+            xml_.encode("utf-8")
+        except AttributeError: # TODO remove this hack
+            pass
+        return xml_
 
     @classmethod
     def from_xml(cls, xml):
diff --git a/test/test_de.py b/test/test_de.py
@@ -213,7 +213,7 @@ def test_parse(self):
         # 3) Assert the accuracy of the German tagger.
         i, n = 0, 0
         for sentence in open(os.path.join(PATH, "corpora", "tagged-de-tiger.txt")).readlines():
-            sentence = sentence.decode("utf-8").strip()
+            sentence = sentence.strip()
             s1 = [w.split("/") for w in sentence.split(" ")]
             s1 = [de.stts2penntreebank(w, pos) for w, pos in s1]
             s2 = [[w for w, pos in s1]]
@@ -239,13 +239,14 @@ def test_command_line(self):
 
         # Assert parsed output from the command-line (example from the
         # documentation).
-        p = ["python", "-m", "pattern.de", "-s", "Der grosse Hund.", "-OTCRL"]
-        p = subprocess.Popen(p, stdout=subprocess.PIPE)
+        command = ["python", "-m", "pattern.de", "-s", "Der grosse Hund.", "-OTCRL"]
+        p = subprocess.Popen(command, stdout=subprocess.PIPE)
         p.wait()
         v = p.stdout.read()
         v = v.strip()
         self.assertEqual(
-            v, "Der/DT/B-NP/O/O/der grosse/JJ/I-NP/O/O/gross Hund/NN/I-NP/O/O/hund ././O/O/O/.")
+            v,
+            b"Der/DT/B-NP/O/O/der grosse/JJ/I-NP/O/O/gross Hund/NN/I-NP/O/O/hund ././O/O/O/.")
         print("python -m pattern.de")
 
 #-------------------------------------------------------------------------