| 1 | +"""C4 dataset based on Common Crawl.""" |
| 2 | + |
| 3 | +import os |
| 4 | +import gzip |
| 5 | + |
| 6 | +import requests |
| 7 | +from tqdm import tqdm |
| 8 | + |
| 9 | +BASE_DOWNLOAD_PATH = "/tmp" |
| 10 | + |
| 11 | +_VARIANTS = ["en", "realnewslike", "en.noblocklist", "en.noclean"] |
| 12 | + |
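# Number of shards per split for each variant, matching the file layout of the
# allenai/c4 dataset repository referenced in _DATA_URL below.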
_N_SHARDS_PER_SPLIT = {
    "en": {"train": 1024, "validation": 8},
    "realnewslike": {"train": 512, "validation": 1},
    "en.noblocklist": {"train": 1024, "validation": 8},
    "en.noclean": {"train": 7168, "validation": 64},
}

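# Shard URL template, pinned to a specific commit of allenai/c4 so that
# downloads stay reproducible.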
# _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/{name}/c4-{split}.{index:05d}-of-{n_shards:05d}.json.gz"
_DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/607bd4c8450a42878aa9ddc051a65a055450ef87/{name}/c4-{split}.{index:05d}-of-{n_shards:05d}.json.gz"


def download(url: str, fname: str):
    """Stream a remote file to disk, showing a tqdm progress bar."""
    resp = requests.get(url, stream=True)
    # Fail early on HTTP errors instead of writing an error page to disk.
    resp.raise_for_status()
    total = int(resp.headers.get('content-length', 0))
    # Can also replace 'file' with an io.BytesIO object
    with open(fname, 'wb') as file, tqdm(
        desc=fname,
        total=total,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in resp.iter_content(chunk_size=1024):
            size = file.write(data)
            bar.update(size)


def decompress(infile, tofile):
    """Decompress a .json.gz shard to plain JSON lines (reads the whole archive into memory)."""
    with open(infile, 'rb') as inf, open(tofile, 'w', encoding='utf8') as tof:
        decom_str = gzip.decompress(inf.read()).decode('utf-8')
        tof.write(decom_str)


if __name__ == "__main__":
    for variant in _VARIANTS:
        print('\n=============================================================')
        print(f'Processing Variant: {variant}')

        variant_dir = os.path.join(BASE_DOWNLOAD_PATH, variant)
        os.makedirs(variant_dir, exist_ok=True)

        for split in ["train", "validation"]:
            # NOTE: the train split is skipped here, so only validation shards
            # are downloaded. Remove this check to fetch the full train split.
            if split == "train":
                continue

            num_shards = _N_SHARDS_PER_SPLIT[variant][split]

            print(f"Split: {split}, Shards: {num_shards}")

            for index in range(num_shards):
                url = _DATA_URL.format(
                    name=variant,
                    split=split,
                    index=index,
                    n_shards=num_shards
                )

                filename = os.path.join(variant_dir, url.split("/")[-1])

                # Download the shard in gzip format if it is not already present.
                if not os.path.isfile(filename):
                    print(f"Downloading: {url}...")
                    download(url, fname=filename)
                else:
                    print(f"Already exists: {filename}")

                # Decompress the gzip shard to JSON if it has not been done yet.
                target_file = filename.replace(".gz", "")

                if not os.path.isfile(target_file):
                    print(f"Decompressing: {filename}...")
                    decompress(filename, target_file)
                else:
                    print(f"Decompressed file already exists: {target_file}")