@@ -241,7 +241,7 @@ def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
 
     def to_dict(self):
         encoder_dict = super().to_dict()
-        encoder_dict.pop("image_processor_type", None)
+        encoder_dict["image_processor_type"] = "Ernie4_5_VLImageProcessor"
         return encoder_dict
 
     def _preprocess(
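Instead of popping image_processor_type out of the serialized config, to_dict() now pins it to the concrete class name, presumably so a saved preprocessor config can be resolved back to the correct processor class when it is reloaded. A minimal sketch of the resulting serialization, with purely illustrative field values (not the processor's real fields):

import json

# Illustrative stand-in for super().to_dict(); the real dict carries the
# processor's actual configuration fields (the values below are made up).
encoder_dict = {"do_resize": True, "min_pixels": 56 * 56}
# The change pins the type name instead of removing it.
encoder_dict["image_processor_type"] = "Ernie4_5_VLImageProcessor"
print(json.dumps(encoder_dict, indent=2))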
@@ -33,6 +33,7 @@
     SFT_IMAGE_END_TOKEN,
     SFT_IMAGE_START_TOKEN,
 )
+from paddleformers.transformers.legacy.tokenizer_utils_base import BatchEncoding
 
 
 class VideoProcess(Process):
@@ -335,7 +336,11 @@ def calculate_ratios_with_min_one(numbers):
         # calculate the ratio of each video
         text_token_count = 0
         for item in meta["text_info"]:
-            text_token_count += len(self.tokenizer.encode(item["text"])["input_ids"])
+            tokens = self.tokenizer.encode(item["text"])
+            if isinstance(tokens, BatchEncoding):
+                text_token_count += len(tokens["input_ids"])
+            else:
+                text_token_count += len(tokens)
         text_token_count += 1  # for eos token
 
         if not self.is_training:
data_processor/steps/input_ids_messaging/data_utils.py (7 changes: 6 additions & 1 deletion)
@@ -22,11 +22,16 @@
 
 import xxhash
 from PIL import Image
+from paddleformers.transformers.legacy.tokenizer_utils_base import BatchEncoding
 
 
 def get_text_token_num(tokenizer, text: str):
     """text tokenize and count"""
-    return len(tokenizer.encode(text)["input_ids"])
+    tokens = tokenizer.encode(text)
+    if isinstance(tokens, BatchEncoding):
+        return len(tokens["input_ids"])
+    else:
+        return len(tokens)
 
 
 def get_uniq_id(text):
@@ -51,6 +51,7 @@
     SFT_VIDEO_END_TOKEN,
     SFT_VIDEO_START_TOKEN,
 )
+from paddleformers.transformers.legacy.tokenizer_utils_base import BatchEncoding
 
 
 class SlidingWindowsContextManager:
@@ -539,7 +540,9 @@ def _text_tokenization(self, sample, dataset_name, data_type):
             else:
                 cur_tokens = self.tokenizer.encode(
                     item["text"], add_special_tokens=False, return_attention_mask=False
-                )["input_ids"]
+                )
+                if isinstance(cur_tokens, BatchEncoding):
+                    cur_tokens = cur_tokens["input_ids"]
             input_ids.append(cur_tokens)
 
             mask_flag = item.get("tag", "no_mask")
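The encode-handling hunks above all apply the same compatibility shim: depending on the paddleformers version, tokenizer.encode may return a BatchEncoding mapping that holds the ids under "input_ids", or simply a plain list of token ids, so each call site checks the type before indexing. A standalone sketch of that shared pattern; the helper names to_input_ids and count_text_tokens are hypothetical and not part of this PR:

from paddleformers.transformers.legacy.tokenizer_utils_base import BatchEncoding


def to_input_ids(encoded):
    """Normalize tokenizer.encode output to a plain list of token ids."""
    # Some paddleformers versions return a BatchEncoding mapping ...
    if isinstance(encoded, BatchEncoding):
        return encoded["input_ids"]
    # ... others return the id list directly.
    return encoded


def count_text_tokens(tokenizer, text):
    """Count tokens in text regardless of what encode() returns."""
    return len(to_input_ids(tokenizer.encode(text)))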
ernie/tokenizer.py (5 changes: 4 additions & 1 deletion)
@@ -24,7 +24,10 @@
 import paddle
 import sentencepiece as spm
 from paddleformers.transformers import PretrainedTokenizer
-from paddleformers.transformers.tokenizer_utils_base import PaddingStrategy, TextInput
+from paddleformers.transformers.legacy.tokenizer_utils_base import (
+    PaddingStrategy,
+    TextInput,
+)
 from paddleformers.utils.log import logger
 
 
ernie/tokenizer_vl.py (2 changes: 1 addition & 1 deletion)
@@ -29,7 +29,7 @@
 
 from paddleformers.utils.log import logger
 from paddleformers.transformers import PretrainedTokenizer
-from paddleformers.transformers.tokenizer_utils_base import (
+from paddleformers.transformers.legacy.tokenizer_utils_base import (
     PaddingStrategy,
     TextInput,
 )
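Both tokenizer modules now import PaddingStrategy and TextInput from paddleformers.transformers.legacy.tokenizer_utils_base rather than paddleformers.transformers.tokenizer_utils_base, presumably tracking a paddleformers release that relocated these base utilities under legacy. If one code path had to run against both old and new paddleformers installs, a hedged alternative (not what this PR does) would be an import fallback:

try:
    # Newer layout: the tokenizer base utilities live under legacy/.
    from paddleformers.transformers.legacy.tokenizer_utils_base import (
        PaddingStrategy,
        TextInput,
    )
except ImportError:
    # Older layout: they are still importable from the original path.
    from paddleformers.transformers.tokenizer_utils_base import (
        PaddingStrategy,
        TextInput,
    )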
erniekit/train/vl_sft/workflow.py (21 changes: 13 additions & 8 deletions)
@@ -20,9 +20,7 @@
 
 import numpy as np
 import paddle
-from paddleformers.transformers import (
-    AutoConfig,
-)
+from paddleformers.transformers import AutoConfig, AutoTokenizer
 from paddle.distributed import fleet
 from paddleformers.datasets import IterDataset
 from paddleformers.trainer import get_last_checkpoint
@@ -238,11 +236,18 @@ def run_vl_sft(
     )
     print("data_processor_args:\n", preprocess_args)
 
-    tokenizer = Ernie4_5_VLTokenizer.from_pretrained(
-        model_args.model_name_or_path,
-        padding_side="right",
-        model_max_length=data_args.max_seq_len,
-    )
+    if convert_from_hf:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            padding_side="right",
+            model_max_length=data_args.max_seq_len,
+        )
+    else:
+        tokenizer = Ernie4_5_VLTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            padding_side="right",
+            model_max_length=data_args.max_seq_len,
+        )
     data_processor = End2EndProcessor(preprocess_args, tokenizer, image_preprocess_save)
     data_processor.train().sft()
     logger.info(f"[DEBUG] data_processor_args: {preprocess_args}")
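The workflow now branches on convert_from_hf: checkpoints in Hugging Face format are loaded through AutoTokenizer, while native checkpoints keep using Ernie4_5_VLTokenizer, with identical padding and max-length settings in both branches. A compressed sketch of that selection, assuming Ernie4_5_VLTokenizer is importable from this repository (the module path below is a guess) and that model_args and data_args come from the surrounding training setup:

from paddleformers.transformers import AutoTokenizer

# Assumed in-repo module path for the custom tokenizer; adjust if it differs.
from ernie.tokenizer_vl import Ernie4_5_VLTokenizer


def load_sft_tokenizer(convert_from_hf, model_args, data_args):
    """Pick the tokenizer class based on the checkpoint format."""
    tokenizer_cls = AutoTokenizer if convert_from_hf else Ernie4_5_VLTokenizer
    return tokenizer_cls.from_pretrained(
        model_args.model_name_or_path,
        padding_side="right",
        model_max_length=data_args.max_seq_len,
    )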