Skip to content

Commit 2c29400

Browse files
authored
Fix faster tokenizer multithreads bug (#3119)
* Add imdb hf datasets
* Adjust thucnews load
* upgrade faster_tokenizer
* Revert "Adjust thucnews load" — this reverts commit 677e33a.
* Revert "Add imdb hf datasets" — this reverts commit 72402f1.
* Add CharToBytesOffsetConverter
* Change return value of CharToBytesOffsetConverter
* Fix some omp directives
* Fix some multithreading bugs
1 parent 0844168 commit 2c29400

File tree

6 files changed

+61
-17
lines changed

6 files changed

+61
-17
lines changed

faster_tokenizer/faster_tokenizer/include/pretokenizers/pretokenizer.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,13 @@ struct FASTERTOKENIZER_DECL BytesToCharOffsetConverter
102102
virtual bool convert(const core::Offset&, core::Offset*) const;
103103
};
104104

105+
struct FASTERTOKENIZER_DECL CharToBytesOffsetConverter
106+
: public OffsetConverter {
107+
std::vector<size_t> offset_map_;
108+
CharToBytesOffsetConverter(const std::string&);
109+
virtual bool convert(const core::Offset&, core::Offset*) const;
110+
};
111+
105112
} // namespace pretokenizers
106113
} // namespace faster_tokenizer
107114
} // namespace paddlenlp

faster_tokenizer/faster_tokenizer/src/core/encoding.cc

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,9 @@ limitations under the License. */
1919
#include <sstream>
2020
#include "glog/logging.h"
2121

22+
#ifdef WITH_OMP
23+
#include <omp.h>
24+
#endif
2225
namespace paddlenlp {
2326
namespace faster_tokenizer {
2427
namespace core {
@@ -627,7 +630,12 @@ void PadEncodings(std::vector<Encoding>* encodings, const PadMethod& method) {
627630
pad_length % method.pad_to_multiple_of_) {
628631
pad_length += pad_length - pad_length % method.pad_to_multiple_of_;
629632
}
630-
for (auto& encoding : *encodings) {
633+
auto batch_size = encodings->size();
634+
#ifdef WITH_OMP
635+
#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
636+
#endif
637+
for (int i = 0; i < batch_size; ++i) {
638+
auto& encoding = (*encodings)[i];
631639
encoding.Pad(pad_length,
632640
method.pad_id_,
633641
method.pad_token_type_id_,

faster_tokenizer/faster_tokenizer/src/core/tokenizer.cc

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -252,17 +252,16 @@ void Tokenizer::EncodeBatchStrings(
252252
const std::vector<EncodeInput>& batch_encode_input,
253253
std::vector<Encoding>* encodings,
254254
bool add_special_tokens) const {
255-
encodings->resize(batch_encode_input.size());
255+
auto batch_size = batch_encode_input.size();
256+
encodings->resize(batch_size);
256257
#ifdef WITH_OMP
257258
// (TODO:zhoushunjie): Simply use the batch size to estimate the workload of
258259
// tokenization.
259260
// Use workload to determine whether create omp threads. Need to optimize the
260261
// workload estimation.
261-
#pragma omp parallel for if (batch_encode_input.size() >= 4 && \
262-
omp_get_num_threads() > \
263-
1)
262+
#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
264263
#endif
265-
for (int i = 0; i < batch_encode_input.size(); ++i) {
264+
for (int i = 0; i < batch_size; ++i) {
266265
EncodePairStrings(
267266
batch_encode_input[i], &(*encodings)[i], add_special_tokens);
268267
}
@@ -294,17 +293,16 @@ void Tokenizer::EncodeBatchStringsCharOffsets(
294293
const std::vector<EncodeInput>& batch_encode_input,
295294
std::vector<Encoding>* encodings,
296295
bool add_special_tokens) const {
297-
encodings->resize(batch_encode_input.size());
296+
auto batch_size = batch_encode_input.size();
297+
encodings->resize(batch_size);
298298
#ifdef WITH_OMP
299299
// (TODO:zhoushunjie): Simply use the batch size to estimate the workload of
300300
// tokenization.
301301
// Use workload to determine whether create omp threads. Need to optimize the
302302
// workload estimation.
303-
#pragma omp parallel for if (batch_encode_input.size() >= 4 && \
304-
omp_get_num_threads() > \
305-
1)
303+
#pragma omp parallel for if (batch_size >= 4 && omp_get_max_threads() > 1)
306304
#endif
307-
for (int i = 0; i < batch_encode_input.size(); ++i) {
305+
for (int i = 0; i < batch_size; ++i) {
308306
Encoding encoding;
309307
EncodePairStringsCharOffsets(
310308
batch_encode_input[i], &encoding, add_special_tokens);

faster_tokenizer/faster_tokenizer/src/pretokenizers/pretokenizer.cc

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,37 @@ bool BytesToCharOffsetConverter::convert(const core::Offset& offset,
5555
return true;
5656
}
5757

58+
59+
CharToBytesOffsetConverter::CharToBytesOffsetConverter(const std::string& seq)
60+
: OffsetConverter(seq) {
61+
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
62+
std::u32string u32seq = conv.from_bytes(seq);
63+
uint32_t index = 0;
64+
offset_map_.reserve(u32seq.length() * 4);
65+
for (int i = 0; i < u32seq.length(); ++i) {
66+
offset_map_.push_back(index);
67+
auto utf8_len = faster_tokenizer::utils::GetUTF8CharLen(u32seq[i]);
68+
index += utf8_len;
69+
}
70+
offset_map_.push_back(index);
71+
}
72+
73+
bool CharToBytesOffsetConverter::convert(const core::Offset& offset,
74+
core::Offset* result) const {
75+
size_t char_start = offset.first;
76+
size_t char_end = offset.second;
77+
if (offset_map_.size() <= char_start) {
78+
return false;
79+
}
80+
auto byte_start = offset_map_.at(char_start);
81+
auto byte_end = byte_start + 1;
82+
if (offset_map_.size() > char_end) {
83+
byte_end = offset_map_.at(char_end);
84+
}
85+
*result = {byte_start, byte_end};
86+
return true;
87+
}
88+
5889
PreTokenizedString::PreTokenizedString(const std::string& original)
5990
: original_(original) {
6091
splits_.emplace_back(std::move(StringSplit(original_)));

faster_tokenizer/faster_tokenizer/src/tokenizers/ernie_faster_tokenizer.cc

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -93,8 +93,11 @@ void ErnieFasterTokenizer::Init(const core::Vocab& vocab,
9393
bool lowercase,
9494
const std::string& wordpieces_prefix,
9595
uint32_t max_sequence_len) {
96-
models::WordPiece wordpiece(
97-
vocab, unk_token, 100 /* max_input_chars_per_word */, wordpieces_prefix);
96+
models::FasterWordPiece wordpiece(vocab,
97+
unk_token,
98+
100 /* max_input_chars_per_word */,
99+
wordpieces_prefix,
100+
true);
98101
this->SetModel(wordpiece);
99102

100103
std::vector<core::AddedToken> added_tokens;
@@ -121,9 +124,6 @@ void ErnieFasterTokenizer::Init(const core::Vocab& vocab,
121124
clean_text, handle_chinese_chars, strip_accents, lowercase);
122125
this->SetNormalizer(bert_normalizer);
123126

124-
pretokenizers::BertPreTokenizer bert_pretokenizer;
125-
this->SetPreTokenizer(bert_pretokenizer);
126-
127127
if (vocab.size() > 0) {
128128
uint32_t sep_id, cls_id;
129129
if (!this->TokenToId(sep_token, &sep_id)) {

faster_tokenizer/python/faster_tokenizer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
__version__ = "0.1.4"
15+
__version__ = "0.1.5"
1616

1717
from typing import Tuple, Union, Tuple, List
1818
import sys

0 commit comments

Comments (0)