
Commit f142f06

BFJL authored
Change the _idx_to_token list to a dictionary (#617)

Co-authored-by: smallv0221 <[email protected]>
Co-authored-by: Guo Sheng <[email protected]>

1 parent 65db861 commit f142f06

File tree

1 file changed: +7 -9 lines changed


paddlenlp/data/vocab.py

Lines changed: 7 additions & 9 deletions
@@ -92,18 +92,16 @@ def __init__(self,
                 assert special_token in token_to_idx, '{} is not in token_to_idx'.format(
                     special_token)
             self._token_to_idx = token_to_idx
-            self._idx_to_token = sorted(
-                self._token_to_idx.keys(),
-                key=lambda token: self._token_to_idx[token])
+            self._idx_to_token = {idx: token for token, idx in token_to_idx.items()}
             if unk_token:
                 unk_index = self._token_to_idx[unk_token]
                 self._token_to_idx = collections.defaultdict(lambda: unk_index)
                 self._token_to_idx.update(token_to_idx)
         else:
-            self._idx_to_token = list(special_tokens)
+            self._idx_to_token = {idx: special_token for idx, special_token in enumerate(special_tokens)}
             self._token_to_idx = collections.defaultdict()
             self._token_to_idx.update(
-                (token, idx) for idx, token in enumerate(self._idx_to_token))
+                (token, idx) for idx, token in self._idx_to_token.items())
             self._index_counter_keys(counter, special_tokens, max_size,
                                      min_freq)
         if token_to_idx:
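
For reference, a minimal standalone sketch (plain Python with made-up tokens, not PaddleNLP's API) of what the two new comprehensions produce:

    # Inverting a user-supplied token_to_idx, as the first added line does.
    token_to_idx = {"[PAD]": 0, "[UNK]": 1, "hello": 2}
    idx_to_token = {idx: token for token, idx in token_to_idx.items()}
    print(idx_to_token)  # {0: '[PAD]', 1: '[UNK]', 2: 'hello'}

    # Seeding the mapping from special tokens alone, as the else branch does.
    special_tokens = ["[PAD]", "[UNK]"]
    idx_to_token = {idx: tok for idx, tok in enumerate(special_tokens)}
    print(idx_to_token)  # {0: '[PAD]', 1: '[UNK]'}

    # Rebuilding token_to_idx now iterates items() rather than enumerate(),
    # since the indices are explicit dict keys instead of list positions.
    rebuilt = {token: idx for idx, token in idx_to_token.items()}
    print(rebuilt)  # {'[PAD]': 0, '[UNK]': 1}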
@@ -138,8 +136,8 @@ def _index_counter_keys(self, counter, special_tokens, max_size, min_freq):
             if freq < min_freq or len(self._idx_to_token) == max_size:
                 break
             if token not in special_tokens:
-                self._idx_to_token.append(token)
-                self._token_to_idx[token] = len(self._idx_to_token) - 1
+                self._idx_to_token[max(list(self._idx_to_token.keys()) + [-1]) + 1] = token
+                self._token_to_idx[token] = max(self._idx_to_token.keys())

     def _sort_index_according_to_user_specification(self, token_to_idx):
         # Sanity checks
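
The new allocation treats the dict as an append-only sequence: the next index is one past the largest existing key, and the "+ [-1]" makes max() yield -1 on an empty mapping so the first token lands at index 0. A small sketch with made-up tokens:

    idx_to_token = {}
    for token in ["the", "a", "cat"]:
        # Next free index: one past the current maximum key. Appending -1
        # seeds the empty case, where max() over bare keys would raise.
        next_idx = max(list(idx_to_token.keys()) + [-1]) + 1
        idx_to_token[next_idx] = token
    print(idx_to_token)  # {0: 'the', 1: 'a', 2: 'cat'}

One observable cost: scanning every key per insertion makes this loop quadratic in vocabulary size, where list.append was amortized constant; a running counter would avoid that, at the price of a less local change.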
@@ -208,7 +206,7 @@ def to_tokens(self, indices):
                 'Token indices is invalid. Expected 1D array, but received {}D array. '.
                 format(len(indices.shape)))

-        max_idx = len(self._idx_to_token) - 1
+        max_idx = max(self._idx_to_token.keys())

        tokens = []
        for idx in indices:
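
With a dict, len(self._idx_to_token) - 1 would understate the bound whenever user-supplied indices are non-contiguous, which is presumably why the check now uses the maximum key. A minimal illustration with made-up values:

    idx_to_token = {0: "[PAD]", 5: "hello"}
    print(len(idx_to_token) - 1)      # 1 -- old bound, misses key 5
    print(max(idx_to_token.keys()))   # 5 -- new bound, the largest real index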
@@ -316,7 +314,7 @@ def to_json(self, path=None):
             json_str = vocab.to_json(path='./vocab.json')
         """
         vocab_dict = {}
-        vocab_dict['idx_to_token'] = self.idx_to_token
+        vocab_dict['idx_to_token'] = dict(self.idx_to_token)
         vocab_dict['token_to_idx'] = dict(self.token_to_idx)
         vocab_dict['unk_token'] = self.unk_token
         vocab_dict['identifiers_to_tokens'] = self._identifiers_to_tokens
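
One thing for consumers of the serialized form to keep in mind: json.dumps coerces integer object keys to strings, so idx_to_token round-trips with string keys unless they are converted back. A small sketch (plain json, not PaddleNLP's loader):

    import json

    idx_to_token = {0: "[PAD]", 1: "hello"}
    payload = json.dumps({"idx_to_token": idx_to_token})
    # JSON object keys are always strings:
    # {"idx_to_token": {"0": "[PAD]", "1": "hello"}}
    restored = {int(k): v for k, v in json.loads(payload)["idx_to_token"].items()}
    assert restored == idx_to_token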
