Skip to content

Commit b55e437

Browse files
authored
Merge pull request #1390 from BossPi/formers_tokenizer
Formers tokenizer
2 parents 6357e91 + f00e20d commit b55e437

File tree

7 files changed

+35
-14
lines changed

data_processor/image_preprocessor/image_preprocessor_adaptive.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,7 @@ def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
241241

242242
def to_dict(self):
243243
encoder_dict = super().to_dict()
244-
encoder_dict.pop("image_processor_type", None)
244+
encoder_dict["image_processor_type"] = "Ernie4_5_VLImageProcessor"
245245
return encoder_dict
246246

247247
def _preprocess(

data_processor/steps/input_ids_messaging/data_process/process_video.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
SFT_IMAGE_END_TOKEN,
3434
SFT_IMAGE_START_TOKEN,
3535
)
36+
from paddleformers.transformers.legacy.tokenizer_utils_base import BatchEncoding
3637

3738

3839
class VideoProcess(Process):
@@ -335,7 +336,11 @@ def calculate_ratios_with_min_one(numbers):
335336
# calculate the ratio of each video
336337
text_token_count = 0
337338
for item in meta["text_info"]:
338-
text_token_count += len(self.tokenizer.encode(item["text"])["input_ids"])
339+
tokens = self.tokenizer.encode(item["text"])
340+
if isinstance(tokens, BatchEncoding):
341+
text_token_count += len(tokens["input_ids"])
342+
else:
343+
text_token_count += len(tokens)
339344
text_token_count += 1 # for eos token
340345

341346
if not self.is_training:

data_processor/steps/input_ids_messaging/data_utils.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,16 @@
2222

2323
import xxhash
2424
from PIL import Image
25+
from paddleformers.transformers.legacy.tokenizer_utils_base import BatchEncoding
2526

2627

2728
def get_text_token_num(tokenizer, text: str):
2829
"""text tokenize and count"""
29-
return len(tokenizer.encode(text)["input_ids"])
30+
tokens = tokenizer.encode(text)
31+
if isinstance(tokens, BatchEncoding):
32+
return len(tokens["input_ids"])
33+
else:
34+
return len(tokens)
3035

3136

3237
def get_uniq_id(text):

data_processor/steps/input_ids_messaging/example_to_feature.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
SFT_VIDEO_END_TOKEN,
5252
SFT_VIDEO_START_TOKEN,
5353
)
54+
from paddleformers.transformers.legacy.tokenizer_utils_base import BatchEncoding
5455

5556

5657
class SlidingWindowsContextManager:
@@ -539,7 +540,9 @@ def _text_tokenization(self, sample, dataset_name, data_type):
539540
else:
540541
cur_tokens = self.tokenizer.encode(
541542
item["text"], add_special_tokens=False, return_attention_mask=False
542-
)["input_ids"]
543+
)
544+
if isinstance(cur_tokens, BatchEncoding):
545+
cur_tokens = cur_tokens["input_ids"]
543546
input_ids.append(cur_tokens)
544547

545548
mask_flag = item.get("tag", "no_mask")

ernie/tokenizer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@
2424
import paddle
2525
import sentencepiece as spm
2626
from paddleformers.transformers import PretrainedTokenizer
27-
from paddleformers.transformers.tokenizer_utils_base import PaddingStrategy, TextInput
27+
from paddleformers.transformers.legacy.tokenizer_utils_base import (
28+
PaddingStrategy,
29+
TextInput,
30+
)
2831
from paddleformers.utils.log import logger
2932

3033

ernie/tokenizer_vl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
from paddleformers.utils.log import logger
3131
from paddleformers.transformers import PretrainedTokenizer
32-
from paddleformers.transformers.tokenizer_utils_base import (
32+
from paddleformers.transformers.legacy.tokenizer_utils_base import (
3333
PaddingStrategy,
3434
TextInput,
3535
)

erniekit/train/vl_sft/workflow.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@
2020

2121
import numpy as np
2222
import paddle
23-
from paddleformers.transformers import (
24-
AutoConfig,
25-
)
23+
from paddleformers.transformers import AutoConfig, AutoTokenizer
2624
from paddle.distributed import fleet
2725
from paddleformers.datasets import IterDataset
2826
from paddleformers.trainer import get_last_checkpoint
@@ -238,11 +236,18 @@ def run_vl_sft(
238236
)
239237
print("data_processor_args:\n", preprocess_args)
240238

241-
tokenizer = Ernie4_5_VLTokenizer.from_pretrained(
242-
model_args.model_name_or_path,
243-
padding_side="right",
244-
model_max_length=data_args.max_seq_len,
245-
)
239+
if convert_from_hf:
240+
tokenizer = AutoTokenizer.from_pretrained(
241+
model_args.model_name_or_path,
242+
padding_side="right",
243+
model_max_length=data_args.max_seq_len,
244+
)
245+
else:
246+
tokenizer = Ernie4_5_VLTokenizer.from_pretrained(
247+
model_args.model_name_or_path,
248+
padding_side="right",
249+
model_max_length=data_args.max_seq_len,
250+
)
246251
data_processor = End2EndProcessor(preprocess_args, tokenizer, image_preprocess_save)
247252
data_processor.train().sft()
248253
logger.info(f"[DEBUG] data_processor_args: {preprocess_args}")

0 commit comments

Comments
 (0)