Merge pull request #47 from todtom/patch-1

zsdonghao · web-flow · commit 10d73fa50de5 · 2016-12-28T16:41:26.000+08:00
Update nlp.py --> Support Chinese
diff --git a/tensorlayer/nlp.py b/tensorlayer/nlp.py
@@ -744,6 +744,7 @@ def basic_tokenizer(sentence, _WORD_SPLIT=re.compile(b"([.,!?\"':;)(])")):
   - Code from ``/tensorflow/models/rnn/translation/data_utils.py``
   """
   words = []
+  sentence = tf.compat.as_bytes(sentence)
   for space_separated_fragment in sentence.strip().split():
     words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
   return [w for w in words if w]
@@ -840,7 +841,7 @@ def initialize_vocabulary(vocabulary_path):
     rev_vocab = []
     with gfile.GFile(vocabulary_path, mode="rb") as f:
       rev_vocab.extend(f.readlines())
-    rev_vocab = [line.strip() for line in rev_vocab]
+    rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab]
     vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
     return vocab, rev_vocab
   else: