light_controlnet_train/dataset_tools/coyo_1m_dataset_preprocess.py at main · chenbaiyujason/light_controlnet_train · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import argparse
import io
import logging
import os
import random

import jsonlines
import numpy as np
import requests
from datasets import load_dataset
from PIL import Image

logger = logging.getLogger(__name__)


def parse_args():
    """Build the command-line interface and parse the process arguments.

    Returns:
        argparse.Namespace: carries ``train_data_dir`` and ``cache_dir``
        (both required strings), ``max_train_samples`` (int, ``None`` when
        not given) and ``num_proc`` (int, defaults to 1).
    """
    cli = argparse.ArgumentParser(
        description="Example of a data preprocessing script."
    )

    # Required string options: where the dataset and the cache live.
    required_paths = (
        ("--train_data_dir", "The directory to store the dataset"),
        ("--cache_dir", "The directory to store cache"),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    # Optional integer options.
    cli.add_argument(
        "--max_train_samples",
        type=int,
        default=None,
        help="number of examples in the dataset",
    )
    cli.add_argument(
        "--num_proc",
        type=int,
        default=1,
        help="number of processors to use in `dataset.map()`",
    )

    return cli.parse_args()


# quality filter applied to the first `max_train_samples` examples
def filter_function(example):
    """Tell whether `example` passes every score threshold.

    Args:
        example: mapping that provides the keys ``clip_similarity_vitb32``,
            ``watermark_score`` and ``aesthetic_score_laion_v2``.

    Returns:
        bool: ``False`` when the CLIP similarity is below 0.3, the watermark
        score is above 0.4, or the aesthetic score is below 6.0; ``True``
        otherwise.
    """
    # The checks short-circuit in the listed order, so later keys are only
    # read when the earlier thresholds were met.
    rejected = (
        example["clip_similarity_vitb32"] < 0.3
        or example["watermark_score"] > 0.4
        or example["aesthetic_score_laion_v2"] < 6.0
    )
    return not rejected


def filter_dataset(dataset, max_train_samples):
    """Take the leading rows of `dataset` and keep those passing the filter.

    Args:
        dataset: object supporting ``len()``, ``select(indices)`` and
            ``filter(fn)`` (a Hugging Face dataset in this script).
        max_train_samples: how many leading rows to consider before
            filtering. Values larger than the dataset are capped to its
            length.

    Returns:
        The result of applying ``filter_function`` to the selected rows.
    """
    # Selecting indices past the end of the dataset is an error, so never
    # ask for more rows than exist.
    candidate_count = min(max_train_samples, len(dataset))
    small_dataset = dataset.select(range(candidate_count)).filter(filter_function)
    return small_dataset


if __name__ == "__main__":
    # Script entry point: load COYO-700M, pick a quality-filtered subset,
    # download each image, store it next to a grayscale conditioning copy,
    # and append one record per image to `meta.jsonl`.
    args = parse_args()

    # load coyo-700
    dataset = load_dataset(
        "kakaobrain/coyo-700m",
        cache_dir=args.cache_dir,
        split="train",
    )
    dataset_size = len(dataset)

    # Estimate the fraction of examples that survive the filter from a
    # leading probe of up to 20k rows (never more rows than the dataset has).
    probe_size = min(20000, dataset_size)
    if probe_size > 0:
        filter_ratio = len(filter_dataset(dataset, probe_size)) / probe_size
    else:
        filter_ratio = 0.0

    if args.max_train_samples is None or filter_ratio == 0:
        # No target count was requested, or the probe gave no usable
        # estimate (dividing by a zero ratio is impossible): consider every
        # row instead of crashing.
        max_train_samples = dataset_size
    else:
        # Estimate how many candidates are needed based on
        #   (1) the filter_ratio calculated on the probe
        #   (2) the assumption that only 80% of the URLs are still valid
        # and cap it at the number of rows that actually exist.
        max_train_samples = min(
            int(args.max_train_samples / filter_ratio / 0.8), dataset_size
        )

    # filter the dataset down to the estimated number of candidates
    small_dataset = filter_dataset(dataset, max_train_samples)

    # The output folders must exist before any image is saved; otherwise
    # every save fails and is only logged, leaving an empty dataset.
    image_dir = f"{args.train_data_dir}/images"
    processed_image_dir = f"{args.train_data_dir}/processed_images"
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(processed_image_dir, exist_ok=True)
    meta_path = f"{args.train_data_dir}/meta.jsonl"

    def preprocess_and_save(example):
        """Download one example's image and save it with its grayscale copy.

        Reads ``url``, ``id`` and ``text`` from `example`. On success, writes
        both PNG files and appends a record to `meta.jsonl`. Any failure is
        logged and the example is skipped. Returns ``None``.
        """
        image_url = example["url"]
        try:
            # download original image; fail early on HTTP error statuses
            # instead of handing an error page to the image decoder
            response = requests.get(image_url, timeout=5)
            response.raise_for_status()
            image = Image.open(io.BytesIO(response.content))

            image_path = f"{image_dir}/{example['id']}.png"
            image.save(image_path)

            # conditioning image: single-channel grayscale version
            processed_image = image.convert('L')

            processed_image_path = f"{processed_image_dir}/{example['id']}.png"
            processed_image.save(processed_image_path)

            # write to meta.jsonl
            meta = {
                "image": image_path,
                "conditioning_image": processed_image_path,
                "caption": example["text"],
            }
            with jsonlines.open(meta_path, "a") as writer:  # for writing
                writer.write(meta)

        except Exception as e:
            # Best effort by design: dead links and undecodable images are
            # expected, so log and move on to the next example.
            logger.error("Failed to process image %s: %s", image_url, e)

    # preprocess -> image, processed image and meta.jsonl
    small_dataset.map(preprocess_and_save, num_proc=args.num_proc)

    print(f"created data folder at: {args.train_data_dir}")