From 57b31048f9579505c170cd6f375327607ebdee79 Mon Sep 17 00:00:00 2001 From: pkhk-1 Date: Mon, 17 Feb 2025 09:04:15 +0000 Subject: [PATCH] [wip] data loss for ppdocbee --- .../ppdocbee/configs/data_loss_chart_en.json | 7 + .../examples/ppdocbee/ppdocbee_data_loss.py | 642 ++++++++++++++++++ .../ppdocbee/shell/ppdocbee_data_loss.sh | 82 +++ .../models/qwen2_vl/modeling_qwen2_vl.py | 2 +- 4 files changed, 732 insertions(+), 1 deletion(-) create mode 100644 paddlemix/examples/ppdocbee/configs/data_loss_chart_en.json create mode 100644 paddlemix/examples/ppdocbee/ppdocbee_data_loss.py create mode 100644 paddlemix/examples/ppdocbee/shell/ppdocbee_data_loss.sh diff --git a/paddlemix/examples/ppdocbee/configs/data_loss_chart_en.json b/paddlemix/examples/ppdocbee/configs/data_loss_chart_en.json new file mode 100644 index 000000000..097290538 --- /dev/null +++ b/paddlemix/examples/ppdocbee/configs/data_loss_chart_en.json @@ -0,0 +1,7 @@ +{ + "chart_en_32k": { + "annotation": "chart_en_32k/chart_en_31919.json", + "repeat_time": 1 + } + } + \ No newline at end of file diff --git a/paddlemix/examples/ppdocbee/ppdocbee_data_loss.py b/paddlemix/examples/ppdocbee/ppdocbee_data_loss.py new file mode 100644 index 000000000..fc04b4302 --- /dev/null +++ b/paddlemix/examples/ppdocbee/ppdocbee_data_loss.py @@ -0,0 +1,642 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import logging +import math +import os +import random +import sys +import traceback +from dataclasses import dataclass, field +from typing import Dict, Optional + +import numpy as np +import paddle +import paddle.distributed as dist +from paddle.io import Dataset +from paddlenlp.data import DataCollatorForSeq2Seq +from paddlenlp.peft import LoRAConfig, LoRAModel +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, set_seed +from paddlenlp.trainer.trainer import Trainer +from paddlenlp.trainer.trainer_utils import get_last_checkpoint +from PIL import Image, ImageFile, PngImagePlugin, UnidentifiedImageError + +from paddlemix.datasets.internvl_dataset import ConcatDataset, WeightedConcatDataset +from paddlemix.models.qwen2_vl import MIXQwen2Tokenizer +from paddlemix.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration +from paddlemix.models.qwen2_vl.supervised import _encode_supervised_example +from paddlemix.models.qwen2_vl.template import TEMPLATES +from paddlemix.processors.qwen2_vl_processing import ( + Qwen2VLImageProcessor, + Qwen2VLProcessor, +) + +Image.MAX_IMAGE_PIXELS = None +ImageFile.LOAD_TRUNCATED_IMAGES = True +MaximumDecompressedSize = 1024 +MegaByte = 2**20 +PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte + +logger = logging.getLogger(__name__) + + +# Module-level constants: ignore index and placeholder strings +IGNORE_INDEX = -100 +VIDEO_PLACEHOLDER = "