diff --git a/bpe.vocab b/bpe.vocab
new file mode 100644
index 0000000..b6058ba
--- /dev/null
+++ b/bpe.vocab
@@ -0,0 +1,19 @@
+#version: 0.2
+d e
+c o
+e n
+co de
+b u
+w i
+w h
+u b
+r o
+o k
+ok en
+o f</w>
+l a
+i s</w>
+en code
+e x
+c h</w>
+T h
diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh
index ca0a758..b11f8d1 100644
--- a/build_tools/travis/install.sh
+++ b/build_tools/travis/install.sh
@@ -27,6 +27,7 @@
 pip install -U -r requirements.txt --progress-bar off
 pip install spacy --progress-bar off
 pip install nltk --progress-bar off
 pip install sacremoses --progress-bar off
+pip install subword_nmt --progress-bar off
 pip install pandas --progress-bar off
 pip install requests --progress-bar off
diff --git a/requirements.txt b/requirements.txt
index bf644a7..a6ad3db 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,6 +23,7 @@ mock
 # nltk
 # spacy
 # sacremoses
+# subword-nmt
 
 # Optional CUDA Utilties
 # pynvrtc
diff --git a/test_bpe.vocab b/test_bpe.vocab
new file mode 100644
index 0000000..0c2f76d
--- /dev/null
+++ b/test_bpe.vocab
@@ -0,0 +1,23 @@
+#version: 0.2
+h a
+s ;
+s; t</w>
+p o
+po s;t</w>
+p a
+pa j
+paj a
+paja m
+pajam a
+pajama s</w>
+o w</w>
+o u
+o t</w>
+n g</w>
+m y</w>
+l e
+le p
+i n</w>
+b e
+a pos;t</w>
+& apos;t</w>
diff --git a/tests/encoders/text/test_bytepair_encoder.py b/tests/encoders/text/test_bytepair_encoder.py
new file mode 100644
index 0000000..58a6c9c
--- /dev/null
+++ b/tests/encoders/text/test_bytepair_encoder.py
@@ -0,0 +1,63 @@
+import unittest
+import torch
+import sys
+from torchnlp.encoders.text import BPEEncoder
+
+
+class TestBPEEncoder(unittest.TestCase):
+
+    def setUp(self):
+        self.corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
+                       'to build a vocabulary. It will be used when strings are encoded ',
+                       'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']
+
+    def test_vocab(self):
+        encoder = BPEEncoder(self.corpus, from_filenames=False)
+
+        # Test that the reserved tokens were added to index_to_token.
+        self.assertEqual('<pad>', encoder.vocab[0])
+        self.assertEqual('<unk>', encoder.vocab[1])
+        self.assertEqual('</s>', encoder.vocab[2])
+        self.assertEqual('<s>', encoder.vocab[3])
+        self.assertEqual('<copy>', encoder.vocab[4])
+
+        # Test that some high-occurrence subwords made it into the vocabulary.
+        self.assertIn('oken@@', encoder.index_to_token)
+        self.assertIn('encode@@', encoder.index_to_token)
+
+        expect_vocab_size = 57
+        self.assertEqual(expect_vocab_size, encoder.vocab_size)
+
+    def test_encode(self):
+        if sys.version_info.minor > 5:
+            original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'
+            encoder = BPEEncoder(self.corpus, from_filenames=False)
+
+            # Expected encoding.
+            expect = [5, 6, 6, 7, 56, 32, 43, 1, 14, 1, 34, 42, 47, 32, 41, 36, 14, 17,
+                      42, 49, 50, 51, 33, 9, 52, 53, 15, 14, 53, 26, 21, 54, 44, 55, 37]
+
+            encode_lst = encoder.encode(original).numpy().tolist()
+
+            self.assertListEqual(expect, encode_lst)
+
+    def test_decoder(self):
+        if sys.version_info.minor > 5:
+            encoded = torch.tensor([5, 6, 6, 7, 56, 32, 43, 1, 14, 1, 34, 42, 47, 32,
+                                    41, 36, 14, 17, 42, 49, 50, 51, 33, 9, 52, 53, 15,
+                                    14, 53, 26, 21, 54, 44, 55, 37])
+
+            encoder = BPEEncoder(self.corpus, from_filenames=False)
+
+            expect = "This is a coded s<unk> t<unk> ce encoded by the SubwordTextTokenizer."
+
+            self.assertEqual(expect, encoder.decode(encoded))
+
+    def test_encode_decode(self):
+        original = "This is a coded sentence encoded by the SubwordTextTokenizer."
+        expect = "This is a coded s<unk> t<unk> ce encoded by the SubwordTextTokenizer."
+
+        encoder = BPEEncoder(self.corpus, from_filenames=False)
+
+        decode_encode_str = encoder.decode(encoder.encode(original))
+        self.assertEqual(expect, decode_encode_str)
diff --git a/tests/encoders/text/test_bytepair_tokenizer.py b/tests/encoders/text/test_bytepair_tokenizer.py
new file mode 100644
index 0000000..7ac5c90
--- /dev/null
+++ b/tests/encoders/text/test_bytepair_tokenizer.py
@@ -0,0 +1,101 @@
+import unittest
+import pickle
+
+from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer
+
+
+class TestBPETextTokenizer(unittest.TestCase):
+
+    def setUp(self):
+        self.corpus = [
+            "One morning I shot an elephant in my pajamas. How he got in my pajamas, I don't",
+            'know.', 'Groucho Marx',
+            "I haven't slept for 10 days... because that would be too long.", 'Mitch Hedberg'
+        ]
+
+    def test_pre_tokenizer(self):
+        expected = ['One morning I shot an elephant in my pajamas . How he got in my pajamas ,'
+                    ' I don &apos;t',
+                    'know .',
+                    'Groucho Marx',
+                    'I haven &apos;t slept for 10 days ... because that would be too long .',
+                    'Mitch Hedberg']
+
+        self.assertListEqual(expected,
+                             [BPETextTokenizer.pre_tokenize(sen) for sen in self.corpus])
+
+    def test_get_vocabulary(self):
+        def segment_words(line):
+            return BPETextTokenizer._segment_words(line, BPETextTokenizer.pre_tokenize)
+
+        token_counts = BPETextTokenizer.get_vocabulary(self.corpus, segment_words,
+                                                       from_filenames=False)
+        expected = {
+            "&apos;t": 2,
+            ".": 3,
+            "...": 1,
+            "Groucho": 1,
+            "Marx": 1,
+            "Mitch": 1,
+            "Hedberg": 1,
+            "I": 3,
+            "in": 2,
+            "my": 2,
+            "know": 1,
+            "because": 1,
+            "pajamas": 2,
+        }
+        self.assertDictContainsSubset(expected, token_counts)
+
+    def test_learn_bpe(self):
+        tokenizer = BPETextTokenizer('test_bpe')
+        tokenizer.build_from_corpus(self.corpus, from_filenames=False)
+        expected = {('&', 'apos;t</w>'): 21, ('a', 'pos;t</w>'): 20, ('b', 'e'): 19,
+                    ('i', 'n</w>'): 18, ('le', 'p'): 17, ('l', 'e'): 16, ('m', 'y</w>'): 15,
+                    ('n', 'g</w>'): 14, ('o', 't</w>'): 13, ('o', 'u'): 12, ('o', 'w</w>'): 11,
+                    ('pajama', 's</w>'): 10, ('pajam', 'a'): 9, ('paja', 'm'): 8, ('paj', 'a'): 7,
+                    ('pa', 'j'): 6, ('p', 'a'): 5, ('po', 's;t</w>'): 4, ('p', 'o'): 3,
+                    ('s;', 't</w>'): 2, ('s', ';'): 1, ('h', 'a'): 0}
+        self.assertDictEqual(expected, tokenizer.bpe.bpe_codes)
+
+    def test_encode_decode(self):
+        corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
+                  'to build a vocabulary. It will be used when strings are encoded ',
+                  'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']
+
+        original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'
+
+        tokenizer = BPETextTokenizer('test_bpe')
+        tokenizer.build_from_corpus(corpus, from_filenames=False)
+
+        # Encoding should be reversible.
+        encoded = tokenizer.encode(original)
+        decoded = tokenizer.decode(encoded)
+        self.assertEqual(original, decoded)
+
+        # The substrings code@@ and en@@ are frequent enough in the corpus that
+        # they should appear in the vocabulary even though they are substrings
+        # of other included strings.
+        subtoken_strings = encoded
+        self.assertIn('en@@', subtoken_strings)
+        self.assertIn('code@@', subtoken_strings)
+
+    def test_build_vocab(self):
+        tokenizer = BPETextTokenizer('test_bpe')
+        tokenizer.build_from_corpus(self.corpus, from_filenames=False)
+
+        # Test every item in the vocab.
+        expect = {'O@@': 1, 'n@@': 4, 'e': 4, 'm@@': 1, 'o@@': 5, 'r@@': 4, 'i@@': 2,
+                  'ng': 2, 'I': 3, 's@@': 3, 'h@@': 3, 'ot': 2, 'a@@': 4, 'n': 3,
+                  'e@@': 3, 'lep@@': 2, 'ha@@': 3, 't': 3, 'in': 2, 'my': 2,
+                  'pajamas': 2, '.': 4, 'H@@': 2, 'ow': 2, 'g@@': 1, ',': 1, 'd@@': 3,
+                  '&apos;t': 2, 'k@@': 1, 'G@@': 1, 'ou@@': 2, 'c@@': 3, 'o': 2,
+                  'M@@': 2, 'x': 1, 'v@@': 1, 'f@@': 1, 'r': 1, '1@@': 1, '0': 1,
+                  'y@@': 1, 's': 1, '.@@': 2, 'be@@': 2, 'u@@': 1, 't@@': 3,
+                  'w@@': 1, 'l@@': 2, 'd': 1, 'b@@': 1, 'h': 1, 'g': 1}
+
+        self.assertDictEqual(expect, tokenizer.vocab)
+
+
+def test_is_pickleable():
+    tokenizer = BPETextTokenizer('test_bpe')
+    pickle.dumps(tokenizer)
diff --git a/torchnlp/encoders/text/__init__.py b/torchnlp/encoders/text/__init__.py
index 645314c..8f0fff1 100755
--- a/torchnlp/encoders/text/__init__.py
+++ b/torchnlp/encoders/text/__init__.py
@@ -21,6 +21,7 @@
 from torchnlp.encoders.text.text_encoder import TextEncoder
 from torchnlp.encoders.text.treebank_encoder import TreebankEncoder
 from torchnlp.encoders.text.whitespace_encoder import WhitespaceEncoder
+from torchnlp.encoders.text.bytepair_encoder import BPEEncoder
 
 __all__ = [
     'CharacterEncoder', 'DEFAULT_COPY_INDEX', 'DEFAULT_COPY_TOKEN', 'DEFAULT_EOS_INDEX',
@@ -28,5 +29,6 @@
     'DEFAULT_RESERVED_TOKENS', 'DEFAULT_SOS_INDEX', 'DEFAULT_SOS_TOKEN', 'DEFAULT_UNKNOWN_INDEX',
     'DEFAULT_UNKNOWN_TOKEN', 'DelimiterEncoder', 'MosesEncoder', 'pad_tensor',
     'stack_and_pad_tensors', 'TextEncoder', 'SpacyEncoder', 'StaticTokenizerEncoder',
-    'SubwordEncoder', 'TreebankEncoder', 'WhitespaceEncoder', 'BatchedSequences'
+    'SubwordEncoder', 'TreebankEncoder', 'WhitespaceEncoder', 'BatchedSequences',
+    'BPEEncoder'
 ]
diff --git a/torchnlp/encoders/text/bpe_text_tokenizer.py b/torchnlp/encoders/text/bpe_text_tokenizer.py
new file mode 100644
index 0000000..8463322
--- /dev/null
+++ b/torchnlp/encoders/text/bpe_text_tokenizer.py
@@ -0,0 +1,86 @@
+import codecs
+from subword_nmt import learn_bpe, apply_bpe
+from collections import Counter
+from sacremoses import MosesTokenizer, MosesDetokenizer
+
+
+class BPETextTokenizer(object):
+    _moses_tok = MosesTokenizer(lang='en')
+    _moses_detok = MosesDetokenizer(lang='en')
+
+    def __init__(self, file_prefix=None, separator='@@'):
+        if file_prefix is not None:
+            self.codes_file = '{}.vocab'.format(file_prefix)
+
+        self.separator = separator
+        self.bpe = None
+        self.vocab = None
+
+    @staticmethod
+    def pre_tokenize(line):
+        return BPETextTokenizer._moses_tok.tokenize(line, return_str=True)
+
+    @staticmethod
+    def _segment_words(line, pre_apply=None):
+        if pre_apply is not None:
+            line = pre_apply(line)
+        line = str(line)
+        return line.strip('\r\n ').split()
+
+    @staticmethod
+    def get_vocabulary(item_list, segment=_segment_words, from_filenames=True):
+        vocab = Counter()
+        if from_filenames:
+            for fname in item_list:
+                with codecs.open(fname, encoding='UTF-8') as f:
+                    for line in f:
+                        for word in segment(line):
+                            vocab[word] += 1
+        else:
+            for line in item_list:
+                for word in segment(line):
+                    vocab[word] += 1
+        return vocab
+
+    def build_from_corpus(self, item_list, min_count=2, num_symbols=10000,
+                          total_symbols=False, from_filenames=True):
+        def segment_words(line):
+            return self._segment_words(line, self.pre_tokenize)
+
+        vocab_words = self.get_vocabulary(item_list, segment_words, from_filenames=from_filenames)
+
+        vocab_list = ['{0} {1}'.format(key, freq)
+                      for (key, freq) in vocab_words.items()]
+
+        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
+            learn_bpe.learn_bpe(vocab_list, output, num_symbols=num_symbols,
+                                min_frequency=min_count, verbose=False,
+                                is_dict=True, total_symbols=total_symbols)
+
+        with codecs.open(self.codes_file, encoding='UTF-8') as codes:
+            self.bpe = apply_bpe.BPE(codes, separator=self.separator)
+
+        self.vocab = dict(self.get_vocabulary(item_list=item_list, segment=self.segment,
+                                              from_filenames=from_filenames))
+
+    def segment(self, line):
+        if self.bpe is None:
+            raise NameError('Learn BPE first!')
+        line = self.pre_tokenize(line)
+        return self.bpe.segment(line.strip('\r\n ')).split(' ')
+
+    def encode(self, raw_text):
+        return self.segment(raw_text)
+
+    def decode(self, bpe_text, delimiter=' '):
+        decode_string = delimiter.join(bpe_text)
+        try:
+            decode_string = decode_string.decode('utf-8')
+        except Exception:
+            pass
+        decode_string = decode_string \
+            .replace(self.separator + ' ', '') \
+            .replace(self.separator, '')
+        decode_string = str(decode_string).strip('\r\n ').split()
+        decode_string = self._moses_detok.tokenize(decode_string)
+        return decode_string
diff --git a/torchnlp/encoders/text/bytepair_encoder.py b/torchnlp/encoders/text/bytepair_encoder.py
new file mode 100644
index 0000000..87d4311
--- /dev/null
+++ b/torchnlp/encoders/text/bytepair_encoder.py
@@ -0,0 +1,104 @@
+import torch
+
+from torchnlp.encoders.text import TextEncoder, DEFAULT_RESERVED_TOKENS, DEFAULT_EOS_INDEX, \
+    DEFAULT_UNKNOWN_INDEX, DEFAULT_PADDING_INDEX
+from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer
+
+
+class BPEEncoder(TextEncoder):
+    """ Encodes text using byte pair encoding.
+
+    **Tokenizer Reference:**
+    https://github.com/eladhoffer/seq2seq.pytorch/blob/master/seq2seq/tools/tokenizer.py
+
+    Args:
+        item_list (list): List of data used to build the encoding dictionary and the BPE
+            tokenizer. If the items are file names, ``from_filenames`` must be ``True``;
+            otherwise it must be ``False``.
+        append_eos (bool, optional): If ``True``, append the EOS token to the end of the
+            encoded vector.
+        min_occurrences (int, optional): Minimum number of times a symbol pair must occur
+            to be merged while learning the BPE codes.
+        num_symbols (int, optional): Desired size of the encoding dictionary (number of BPE
+            merge operations).
+        from_filenames (bool, optional): Whether ``item_list`` contains file names or raw
+            strings.
+        reserved_tokens (list of str, optional): List of reserved tokens inserted at the
+            beginning of the dictionary.
+        eos_index (int, optional): The eos token is used to encode the end of a sequence. This
+            is the index that token resides at.
+        unknown_index (int, optional): The unknown token is used to encode unseen tokens. This
+            is the index that token resides at.
+        padding_index (int, optional): The padding token is used to encode sequence padding.
+            This is the index that token resides at.
+        **kwargs: Keyword arguments passed onto ``TextEncoder.__init__``.
+ """ + + def __init__(self, + item_list, + append_eos=False, + min_occurrences=2, + num_symbols=10000, + from_filenames=True, + reserved_tokens=DEFAULT_RESERVED_TOKENS, + eos_index=DEFAULT_EOS_INDEX, + unknown_index=DEFAULT_UNKNOWN_INDEX, + padding_index=DEFAULT_PADDING_INDEX, + **kwargs): + super().__init__(**kwargs) + + self.append_eos = append_eos + self.eos_index = eos_index + self.unknown_index = unknown_index + self.reserved_tokens = reserved_tokens + self.padding_index = padding_index + + self.tokenizer = BPETextTokenizer('./bpe') + self.tokenizer.build_from_corpus(item_list, min_count=min_occurrences, + num_symbols=num_symbols, from_filenames=from_filenames) + + self.index_to_token = reserved_tokens.copy() + self.token_to_index = {token: index for index, token in enumerate(reserved_tokens)} + for token in self.tokenizer.vocab: + self.index_to_token.append(token) + self.token_to_index[token] = len(self.index_to_token) - 1 + + @property + def vocab(self): + """ + Returns: + list: List of tokens in the dictionary. + """ + return self.index_to_token + + @property + def vocab_size(self): + """ + Returns: + int: Number of tokens in the dictionary. + """ + return len(self.vocab) + + def encode(self, sequence): + """ Encodes a ``sequence``. + + Args: + sequence (str): String ``sequence`` to encode. + + Returns: + torch.Tensor: Encoding of the ``sequence``. + """ + sequence = super().encode(sequence) + sequence = self.tokenizer.encode(sequence) + vector = [self.token_to_index.get(token, self.unknown_index) for token in sequence] + if self.append_eos: + vector.append(self.eos_index) + return torch.tensor(vector) + + def decode(self, encoded): + """ Decodes a tensor into a sequence. + + Args: + encoded (torch.Tensor): Encoded sequence. + + Returns: + str: Sequence decoded from ``encoded``. + """ + encoded = super().decode(encoded) + return self.tokenizer.decode([self.index_to_token[index] for index in encoded])