Skip to content
This repository was archived by the owner on Feb 3, 2025. It is now read-only.

Commit 4e6a59b

Browse files
author
DEKHTIARJonathan
committed
[Benchmarking Py] C4 Dataset Added
1 parent 753a731 commit 4e6a59b

File tree

1 file changed

+91
-0
lines changed

1 file changed

+91
-0
lines changed
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
"""C4 dataset based on Common Crawl."""

import os
import gzip

import requests
from tqdm import tqdm

# Root directory under which one sub-directory per variant is created.
BASE_DOWNLOAD_PATH = "/tmp"

# C4 variants published by AllenAI on the Hugging Face Hub.
_VARIANTS = ["en", "realnewslike", "en.noblocklist", "en.noclean"]

# Number of gzipped JSON-lines shards per split, for each variant.
_N_SHARDS_PER_SPLIT = {
    "en": {"train": 1024, "validation": 8},
    "realnewslike": {"train": 512, "validation": 1},
    "en.noblocklist": {"train": 1024, "validation": 8},
    "en.noclean": {"train": 7168, "validation": 64},
}

# Shard download URL template, pinned to a specific dataset revision so
# downloads are reproducible. The previous revision is kept for reference.
# _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/{name}/c4-{split}.{index:05d}-of-{n_shards:05d}.json.gz"
_DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/607bd4c8450a42878aa9ddc051a65a055450ef87/{name}/c4-{split}.{index:05d}-of-{n_shards:05d}.json.gz"
22+
23+
24+
def download(url: str, fname: str):
    """Stream-download ``url`` to local path ``fname`` with a progress bar.

    The response is streamed in 1 KiB chunks so arbitrarily large shards
    never have to fit in memory.

    Args:
        url: HTTP(S) URL of the file to fetch.
        fname: Destination path on the local filesystem.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (otherwise an HTML error page would be silently saved as data).
    """
    # Context manager guarantees the underlying connection is released.
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        # Servers may omit Content-Length; tqdm then shows a rate-only bar.
        total = int(resp.headers.get('content-length', 0))
        with open(fname, 'wb') as file, tqdm(
            desc=fname,
            total=total,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=1024):
                # file.write returns the number of bytes written.
                bar.update(file.write(data))
38+
39+
40+
def decompress(infile, tofile):
    """Decompress the gzip file ``infile`` into the UTF-8 text file ``tofile``.

    Streams decoded text in 1 MiB chunks instead of loading the whole
    archive into memory at once (C4 shards decompress to hundreds of MB).

    Args:
        infile: Path to the source ``.gz`` file.
        tofile: Path of the decompressed text file to create/overwrite.
    """
    with gzip.open(infile, 'rt', encoding='utf-8') as inf, \
            open(tofile, 'w', encoding='utf8') as tof:
        while chunk := inf.read(1 << 20):  # 1 MiB of decoded text per read
            tof.write(chunk)
44+
45+
46+
if __name__ == "__main__":
    # Fetch and decompress every configured C4 variant under
    # BASE_DOWNLOAD_PATH. Both the download and the decompression steps
    # are idempotent: files that already exist on disk are skipped.
    for variant in _VARIANTS:
        print('\n=============================================================')
        print(f'Processing Variant: {variant}')

        variant_dir = os.path.join(BASE_DOWNLOAD_PATH, variant)
        os.makedirs(variant_dir, exist_ok=True)

        for split in ["train", "validation"]:
            # NOTE: the (huge) train split is deliberately skipped; only
            # the much smaller validation split is fetched.
            if split == "train":
                continue

            num_shards = _N_SHARDS_PER_SPLIT[variant][split]

            print(f"Split: {split}, Shards: {num_shards}")

            for index in range(num_shards):
                url = _DATA_URL.format(
                    name=variant,
                    split=split,
                    index=index,
                    n_shards=num_shards
                )

                # Shard files keep their upstream basename.
                filename = os.path.join(variant_dir, url.split("/")[-1])

                # Downloading the file in GZIP format
                if not os.path.isfile(filename):
                    print(f"Downloading: {url}...")
                    download(url, fname=filename)
                else:
                    print(f"Already exists: {filename}...")

                # Processing the file from GZIP to JSON
                target_file = filename.replace(".gz", "")

                if not os.path.isfile(target_file):
                    print(f"Decompressing: {filename}...")
                    decompress(filename, target_file)
                else:
                    print(f"Decompressed file already exists: {target_file}")

0 commit comments

Comments
 (0)