forked from huggingface/community-events
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcoyo_1m_dataset_preprocess.py
More file actions
116 lines (94 loc) · 3.2 KB
/
coyo_1m_dataset_preprocess.py
File metadata and controls
116 lines (94 loc) · 3.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import argparse
import logging
import os
import random

import jsonlines
import numpy as np
import requests
from datasets import load_dataset
from PIL import Image
logger = logging.getLogger(__name__)
def _positive_int(value):
    """Convert a command-line string to an int, rejecting values below 1.

    Used as an argparse ``type=`` callable so that bad counts are reported
    as a usage error at parse time instead of failing later in the run.
    """
    number = int(value)
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"expected a positive integer, got {value!r}"
        )
    return number


def parse_args(argv=None):
    """Parse the command-line options of the preprocessing script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behavior.

    Returns:
        argparse.Namespace with ``train_data_dir``, ``cache_dir``,
        ``max_train_samples`` (``None`` when not given) and ``num_proc``.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            a value is invalid (including non-positive counts).
    """
    parser = argparse.ArgumentParser(
        description="Example of a data preprocessing script."
    )
    parser.add_argument(
        "--train_data_dir",
        type=str,
        required=True,
        help="The directory to store the dataset",
    )
    parser.add_argument(
        "--cache_dir",
        type=str,
        required=True,
        help="The directory to store cache",
    )
    parser.add_argument(
        "--max_train_samples",
        type=_positive_int,
        default=None,
        help="number of examples in the dataset",
    )
    parser.add_argument(
        "--num_proc",
        type=_positive_int,
        default=1,
        help="number of processors to use in `dataset.map()`",
    )
    return parser.parse_args(argv)
# quality filter applied to each example considered for the training subset
def filter_function(
    example,
    *,
    min_clip_similarity=0.3,
    max_watermark_score=0.4,
    min_aesthetic_score=6.0,
):
    """Return True when ``example`` passes all three score thresholds.

    Args:
        example: Mapping with the keys ``clip_similarity_vitb32``,
            ``watermark_score`` and ``aesthetic_score_laion_v2``.
        min_clip_similarity: Examples whose ``clip_similarity_vitb32`` is
            below this value are rejected.
        max_watermark_score: Examples whose ``watermark_score`` is above
            this value are rejected.
        min_aesthetic_score: Examples whose ``aesthetic_score_laion_v2`` is
            below this value are rejected.

    Returns:
        bool: ``False`` as soon as one threshold is violated, else ``True``.
        Values exactly equal to a threshold pass.
    """
    # Checks run in the original order and short-circuit, so later keys are
    # only read when the earlier checks pass.
    rejected = (
        example["clip_similarity_vitb32"] < min_clip_similarity
        or example["watermark_score"] > max_watermark_score
        or example["aesthetic_score_laion_v2"] < min_aesthetic_score
    )
    return not rejected
def filter_dataset(dataset, max_train_samples):
    """Take the leading rows of ``dataset`` and keep those passing the filter.

    Args:
        dataset: Object supporting ``len()``, ``select(indices)`` and
            ``filter(fn)`` (a ``datasets.Dataset`` in this script).
        max_train_samples: Number of leading rows to examine before
            filtering. ``None`` means every row. Values larger than the
            dataset are clamped to its length, because selecting indices
            past the end raises an error.

    Returns:
        The result of ``dataset.select(...).filter(filter_function)``; it can
        contain fewer than ``max_train_samples`` rows.
    """
    total_rows = len(dataset)
    if max_train_samples is None:
        row_count = total_rows
    else:
        # A negative request yields an empty selection, as range() did before.
        row_count = max(0, min(max_train_samples, total_rows))
    return dataset.select(range(row_count)).filter(filter_function)
if __name__ == "__main__":
    # Script entry point: pick a filtered subset of COYO-700M, download each
    # image, store it together with a grayscale copy, and append one record
    # per image to `meta.jsonl` in the output directory.
    args = parse_args()

    # Number of leading rows used to estimate how many examples pass the filter.
    PROBE_SIZE = 20000
    # Assumption from the original script: only 80% of the URLs are still valid.
    VALID_URL_RATIO = 0.8

    # `Image.save` does not create missing parent directories, so the output
    # folders must exist before the first image is written.
    os.makedirs(f"{args.train_data_dir}/images", exist_ok=True)
    os.makedirs(f"{args.train_data_dir}/processed_images", exist_ok=True)

    # load coyo-700m
    dataset = load_dataset(
        "kakaobrain/coyo-700m",
        cache_dir=args.cache_dir,
        split="train",
    )
    dataset_size = len(dataset)

    if args.max_train_samples is None:
        # No target size requested: run the filter over every row.
        max_train_samples = dataset_size
    else:
        # Estimate the fraction of examples that pass the filter from the
        # first PROBE_SIZE rows (fewer if the dataset is smaller).
        probe_size = min(PROBE_SIZE, dataset_size)
        kept = len(filter_dataset(dataset, probe_size)) if probe_size else 0
        if kept == 0:
            raise SystemExit(
                "No example passed the filter in the probe sample; "
                "cannot estimate how many rows to scan."
            )
        filter_ratio = kept / probe_size

        # Estimate how many raw rows must be scanned, based on
        # (1) the filter_ratio calculated on the probe sample
        # (2) the assumption that only 80% of the URLs are still valid
        # and never ask for more rows than the dataset contains.
        max_train_samples = min(
            int(args.max_train_samples / filter_ratio / VALID_URL_RATIO),
            dataset_size,
        )

    # filter the dataset down to the estimated number of candidate rows
    small_dataset = filter_dataset(dataset, max_train_samples)

    def preprocess_and_save(example):
        """Download one image, save it and its grayscale copy, log metadata.

        Reads ``example["url"]``, ``example["id"]`` and ``example["text"]``.
        Any failure is logged and the example is skipped, so a single bad
        URL does not abort the whole `map()` run.
        """
        image_url = example["url"]
        try:
            # download original image; the context manager releases the
            # streamed connection even when decoding fails
            with requests.get(image_url, stream=True, timeout=5) as response:
                response.raise_for_status()
                image = Image.open(response.raw)
                # force the pixel data to be read while the stream is open
                image.load()

            image_path = f"{args.train_data_dir}/images/{example['id']}.png"
            image.save(image_path)

            # conditioning image: single-channel grayscale copy
            processed_image = image.convert("L")
            processed_image_path = (
                f"{args.train_data_dir}/processed_images/{example['id']}.png"
            )
            processed_image.save(processed_image_path)

            # write to meta.jsonl
            meta = {
                "image": image_path,
                "conditioning_image": processed_image_path,
                "caption": example["text"],
            }
            with jsonlines.open(
                f"{args.train_data_dir}/meta.jsonl", "a"
            ) as writer:
                writer.write(meta)
        except Exception as e:
            # Deliberate best-effort boundary: report and move on.
            logger.error("Failed to process image %s: %s", image_url, e)

    # preprocess -> image, processed image and meta.jsonl
    small_dataset.map(preprocess_and_save, num_proc=args.num_proc)

    print(f"created data folder at: {args.train_data_dir}")