Skip to content

Commit 2c29400

Browse files
authored
Fix faster tokenizer multithreads bug (#3119)
* Add imdb hf datasets
* Adjust thucnews load
* upgrade faster_tokenizer
* Revert "Adjust thucnews load" — this reverts commit 677e33a.
* Revert "Add imdb hf datasets" — this reverts commit 72402f1.
* Add CharToBytesOffsetConverter
* Change return value of CharToBytesOffsetConverter
* Fix some omp directives
* Fix some multithreading bugs
1 parent 0844168 commit 2c29400

File tree

6 files changed

+61
-17
lines changed

6 files changed

+61
-17
lines changed

faster_tokenizer/faster_tokenizer/include/pretokenizers/pretokenizer.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,13 @@ struct FASTERTOKENIZER_DECL BytesToCharOffsetConverter
102102
virtual bool convert(const core::Offset&, core::Offset*) const;
103103
};
104104

105+
struct FASTERTOKENIZER_DECL CharToBytesOffsetConverter
106+
: public OffsetConverter {
107+
std::vector<size_t> offset_map_;
108+
CharToBytesOffsetConverter(const std::string&);
109+
virtual bool convert(const core::Offset&, core::Offset*) const;
110+
};
111+
105112
} // namespace pretokenizers
106113
} // namespace faster_tokenizer
107114
} // namespace paddlenlp

faster_tokenizer/faster_tokenizer/src/core/encoding.cc

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,9 @@ limitations under the License. */
1919
#include <sstream>
2020
#include "glog/logging.h"
2121

22+
#ifdef WITH_OMP
23+
#include <omp.h>
24+
#endif
2225
namespace paddlenlp {
2326
namespace faster_tokenizer {
2427
namespace core {
@@ -627,7 +630,12 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
627630
pad_length % method.pad_to_multiple_of_) {
628631
pad_length += pad_length - pad_length % method.pad_to_multiple_of_;
629632
}
630-
for (auto& encoding : *encodings) {
633+
auto batch_size = encodings->size();
634+
#ifdef WITH_OMP
635+
#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
636+
#endif
637+
for (int i = 0; i < batch_size; ++i) {
638+
auto& encoding = (*encodings)[i];
631639
encoding.Pad(pad_length,
632640
method.pad_id_,
633641
method.pad_token_type_id_,

faster_tokenizer/faster_tokenizer/src/core/tokenizer.cc

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -252,17 +252,16 @@ void Tokenizer::EncodeBatchStrings(
252252
const std::vector<EncodeInput>& batch_encode_input,
253253
std::vector<Encoding>* encodings,
254254
bool add_special_tokens) const {
255-
encodings->resize(batch_encode_input.size());
255+
auto batch_size = batch_encode_input.size();
256+
encodings->resize(batch_size);
256257
#ifdef WITH_OMP
257258
// (TODO:zhoushunjie): Simply use the batch size to estimate the workload of
258259
// tokenization.
259260
// Use workload to determine whether create omp threads. Need to optimize the
260261
// workload estimation.
261-
#pragma omp parallel for if (batch_encode_input.size() >= 4 && \
262-
omp_get_num_threads() > \
263-
1)
262+
#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
264263
#endif
265-
for (int i = 0; i < batch_encode_input.size(); ++i) {
264+
for (int i = 0; i < batch_size; ++i) {
266265
EncodePairStrings(
267266
batch_encode_input[i], &(*encodings)[i], add_special_tokens);
268267
}
@@ -294,17 +293,16 @@ void Tokenizer::EncodeBatchStringsCharOffsets(
294293
const std::vector<EncodeInput>& batch_encode_input,
295294
std::vector<Encoding>* encodings,
296295
bool add_special_tokens) const {
297-
encodings->resize(batch_encode_input.size());
296+
auto batch_size = batch_encode_input.size();
297+
encodings->resize(batch_size);
298298
#ifdef WITH_OMP
299299
// (TODO:zhoushunjie): Simply use the batch size to estimate the workload of
300300
// tokenization.
301301
// Use workload to determine whether create omp threads. Need to optimize the
302302
// workload estimation.
303-
#pragma omp parallel for if (batch_encode_input.size() >= 4 && \
304-
omp_get_num_threads() > \
305-
1)
303+
#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
306304
#endif
307-
for (int i = 0; i < batch_encode_input.size(); ++i) {
305+
for (int i = 0; i < batch_size; ++i) {
308306
Encoding encoding;
309307
EncodePairStringsCharOffsets(
310308
batch_encode_input[i], &encoding, add_special_tokens);

faster_tokenizer/faster_tokenizer/src/pretokenizers/pretokenizer.cc

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,37 @@ bool BytesToCharOffsetConverter::convert(const core::Offset& offset,
5555
return true;
5656
}
5757

58+
59+
CharToBytesOffsetConverter::CharToBytesOffsetConverter(const std::string& seq)
60+
: OffsetConverter(seq) {
61+
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
62+
std::u32string u32seq = conv.from_bytes(seq);
63+
uint32_t index = 0;
64+
offset_map_.reserve(u32seq.length() * 4);
65+
for (int i = 0; i < u32seq.length(); ++i) {
66+
offset_map_.push_back(index);
67+
auto utf8_len = faster_tokenizer::utils::GetUTF8CharLen(u32seq[i]);
68+
index += utf8_len;
69+
}
70+
offset_map_.push_back(index);
71+
}
72+
73+
bool CharToBytesOffsetConverter::convert(const core::Offset& offset,
74+
core::Offset* result) const {
75+
size_t char_start = offset.first;
76+
size_t char_end = offset.second;
77+
if (offset_map_.size() <= char_start) {
78+
return false;
79+
}
80+
auto byte_start = offset_map_.at(char_start);
81+
auto byte_end = byte_start + 1;
82+
if (offset_map_.size() > char_end) {
83+
byte_end = offset_map_.at(char_end);
84+
}
85+
*result = {byte_start, byte_end};
86+
return true;
87+
}
88+
5889
PreTokenizedString::PreTokenizedString(const std::string& original)
5990
: original_(original) {
6091
splits_.emplace_back(std::move(StringSplit(original_)));

faster_tokenizer/faster_tokenizer/src/tokenizers/ernie_faster_tokenizer.cc

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -93,8 +93,11 @@ void ErnieFasterTokenizer::Init(const core::Vocab& vocab,
9393
bool lowercase,
9494
const std::string& wordpieces_prefix,
9595
uint32_t max_sequence_len) {
96-
models::WordPiece wordpiece(
97-
vocab, unk_token, 100 /* max_input_chars_per_word */, wordpieces_prefix);
96+
models::FasterWordPiece wordpiece(vocab,
97+
unk_token,
98+
100 /* max_input_chars_per_word */,
99+
wordpieces_prefix,
100+
true);
98101
this->SetModel(wordpiece);
99102

100103
std::vector<core::AddedToken> added_tokens;
@@ -121,9 +124,6 @@ void ErnieFasterTokenizer::Init(const core::Vocab& vocab,
121124
clean_text, handle_chinese_chars, strip_accents, lowercase);
122125
this->SetNormalizer(bert_normalizer);
123126

124-
pretokenizers::BertPreTokenizer bert_pretokenizer;
125-
this->SetPreTokenizer(bert_pretokenizer);
126-
127127
if (vocab.size() > 0) {
128128
uint32_t sep_id, cls_id;
129129
if (!this->TokenToId(sep_token, &sep_id)) {

faster_tokenizer/python/faster_tokenizer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
__version__ = "0.1.4"
15+
__version__ = "0.1.5"
1616

1717
from typing import Tuple, Union, Tuple, List
1818
import sys

0 commit comments

Comments (0)