
Commit 908e0f1

Fix issues with download and extract (#541)
* Make text field optional in extractor and add tests
* Update documentation on common crawl download
* Add record limit and adjust cli args
* Skip flakey tests
* Update docs
* Clarify typing on output types
* Remove skipped tests

Signed-off-by: Ryan Wolf <rywolf@nvidia.com>
1 parent 7042c2c commit 908e0f1

File tree

7 files changed: +656 -180 lines changed


docs/user-guide/download.rst

Lines changed: 76 additions & 16 deletions
@@ -37,10 +37,42 @@ By "extraction", we typically mean the process of converting a data format from

 Otherwise, the HTTPS endpoints will be used with ``wget``. Here is a small example of how to use it:

 .. code-block:: python

-    from nemo_curator.download import download_common_crawl
-
-    common_crawl = download_common_crawl("/extracted/output/folder", "2020-50", "2021-04", output_type="jsonl")
+    import os
+
+    from nemo_curator import get_client
+    from nemo_curator.download import download_common_crawl
+    from nemo_curator.datasets import DocumentDataset
+
+    def main():
+        # Initialize a distributed Dask client
+        client = get_client(cluster_type="cpu")
+
+        # Parameters for downloading Common Crawl data.
+        # - output_folder: directory for temporary download/extraction files
+        # - start_snapshot and end_snapshot define the range to fetch
+        # - output_type: specifies file format for the extracted data (e.g., "jsonl")
+        output_folder = "/extracted/output/folder"
+        start_snapshot = "2020-50"
+        end_snapshot = "2021-04"
+        output_type = "jsonl"
+        os.makedirs(output_folder, exist_ok=True)
+
+        # Download and extract the Common Crawl data.
+        # The function returns a DocumentDataset that contains the extracted documents.
+        # Note: The output folder and output type are passed here to store intermediate files
+        # and check if the data has already been downloaded. They should match the final location
+        # and format of the extracted data.
+        common_crawl_dataset = download_common_crawl(
+            output_folder, start_snapshot, end_snapshot, output_type=output_type
+        )
+
+        # Write the extracted dataset to JSON format.
+        # The 'to_json' method will write one JSON document per line,
+        # preserving the original shard information if write_to_filename is True.
+        common_crawl_dataset.to_json(output_path=output_folder, write_to_filename=True)
+        print("Extracted dataset saved to:", output_folder)
+
+    if __name__ == "__main__":
+        main()

 * ``"/extracted/output/folder"`` is the path on your local filesystem where the final extracted files will be placed.
 * ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here <https://data.commoncrawl.org/>`_.
@@ -50,21 +82,49 @@ By "extraction", we typically mean the process of converting a data format from

 You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.

 .. code-block:: python

-    from nemo_curator.download import (
-        ResiliparseExtractor,
-        download_common_crawl,
-    )
-
-    # Change the extraction algorithm
-    extraction_algorithm = ResiliparseExtractor()
-    common_crawl = download_common_crawl(
-        "/extracted/output/folder",
-        "2020-50",
-        "2021-04",
-        output_type="jsonl",
-        algorithm=extraction_algorithm,
-    )
+    import os
+    from nemo_curator import get_client
+    from nemo_curator.download import (
+        ResiliparseExtractor,
+        download_common_crawl,
+    )
+    from nemo_curator.datasets import DocumentDataset
+
+    def main():
+        # Initialize a distributed Dask client
+        client = get_client(cluster_type="cpu")
+
+        # Parameters for downloading Common Crawl data.
+        # - output_folder: directory for temporary download/extraction files
+        # - start_snapshot and end_snapshot define the range to fetch
+        # - output_type: specifies file format for the extracted data (e.g., "jsonl")
+        output_folder = "/extracted/output/folder"
+        start_snapshot = "2020-50"
+        end_snapshot = "2021-04"
+        output_type = "jsonl"
+        os.makedirs(output_folder, exist_ok=True)
+
+        # Change the extraction algorithm to use ResiliparseExtractor
+        extraction_algorithm = ResiliparseExtractor()
+
+        # Download and extract the Common Crawl data using the Resiliparse extraction algorithm.
+        # The function returns a DocumentDataset that contains the extracted documents.
+        common_crawl_dataset = download_common_crawl(
+            output_folder,
+            start_snapshot,
+            end_snapshot,
+            output_type=output_type,
+            algorithm=extraction_algorithm,
+        )
+
+        # Write the extracted dataset to JSON format.
+        # The 'to_json' method writes one JSON document per line,
+        # preserving the original shard information if write_to_filename is True.
+        common_crawl_dataset.to_json(output_path=output_folder, write_to_filename=True)
+        print("Extracted dataset saved to:", output_folder)
+
+    if __name__ == "__main__":
+        main()

 Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
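As a follow-up to the documentation example above: because the dataset is written with ``to_json`` and ``write_to_filename=True``, it can later be read back for further curation. The snippet below is a minimal sketch, not part of this commit, and assumes ``DocumentDataset.read_json`` with an ``add_filename`` flag behaves as in other NeMo Curator examples; the folder path is the illustrative one from the docs.

.. code-block:: python

    from nemo_curator.datasets import DocumentDataset

    # Sketch (assumption): read the extracted JSONL shards back into a
    # DocumentDataset, keeping the per-shard file names that to_json preserved.
    extracted = DocumentDataset.read_json(
        "/extracted/output/folder", add_filename=True
    )
    print(extracted.df.head())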

nemo_curator/download/arxiv.py

Lines changed: 38 additions & 17 deletions
@@ -18,6 +18,7 @@
 import subprocess
 import tarfile
 import tempfile
+from typing import Literal, Optional

 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.download.doc_builder import (
@@ -218,12 +219,12 @@ def extract(self, content):
                 for file_content in content
             )
         except Exception:
-            return {}, None
+            return None

         # Don't return meta
         if cleaned_latex_file_str is not None:
             if len(cleaned_latex_file_str) > 0:
-                return {}, cleaned_latex_file_str
+                return {"text": cleaned_latex_file_str}

     def _clean_tex_file(self, file_content, arg_macros, non_arg_macros):
         r"""function takes a tex file as input and returns a cleaned version. The
@@ -365,25 +366,44 @@ def _build_non_arg_macros_dict(self, file_content):

 def download_arxiv(
     output_path: str,
-    output_type: str = "jsonl",
-    raw_download_dir=None,
-    keep_raw_download=False,
-    force_download=False,
-    url_limit=None,
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
+    raw_download_dir: Optional[str] = None,
+    keep_raw_download: bool = False,
+    force_download: bool = False,
+    url_limit: Optional[int] = None,
+    record_limit: Optional[int] = None,
 ) -> DocumentDataset:
     """
-    Downloads Arxiv tar files and extracts them
+    Download Arxiv tar files and extract the contained LaTeX projects.
+
+    This function obtains a list of Arxiv tar file URLs (via get_arxiv_urls), downloads the tar files,
+    and then extracts the contained LaTeX source files. The resulting documents (after extraction) are
+    assembled into a DocumentDataset.

     Args:
-        output_path: The path to the root directory of the files
-        output_type: The file type to save the data as.
-        raw_download_dir: Path to store the raw download files for intermediate processing.
-            If None, they are stored in a folder named "downloads" under output_path.
-        keep_raw_download: If True, keeps the compressed WARC files that have not been extracted.
-        force_download: If False, will skip processing all files in output_paths that already exist and
-            directly read from them instead.
-        url_limit: The maximum number of raw files to download from the snapshot. If None, all
-            files from the range of snapshots are downloaded.
+        output_path (str):
+            The root directory where both the final extracted files and the raw download subdirectory will be stored.
+            The extracted files (in the format specified by output_type) are eventually saved in this directory.
+        output_type (Literal["jsonl", "parquet"], optional):
+            The file format/extension used for saving the extracted documents (e.g., "jsonl" or "parquet").
+            Default is "jsonl". This is not used for the output file, but is used to check if an extracted
+            output already exists and read it if so.
+        raw_download_dir (Optional[str], optional):
+            The directory where the raw downloaded tar files will be kept. If None, a folder named "downloads"
+            under output_path is used.
+        keep_raw_download (bool, optional):
+            If True, the raw tar files (before extraction) are not removed after processing. Default is False.
+        force_download (bool, optional):
+            If False, then if an output file already exists for a given URL, re-downloading and re-extraction
+            will be skipped. Default is False.
+        url_limit (Optional[int], optional):
+            Limits the maximum number of Arxiv tar file URLs to download and process.
+            If None, all available URLs (from get_arxiv_urls) are processed.
+        record_limit (Optional[int], optional):
+            Limits the maximum number of records to extract from each tar file.
+            If None, all available records are extracted.
+
+    Returns:
+        DocumentDataset:
+            A dataset object containing the extracted documents.
     """
     arxiv_urls = get_arxiv_urls()
     if url_limit:
@@ -416,6 +436,7 @@ def download_arxiv(
         keep_raw_download=keep_raw_download,
         force_download=force_download,
         filename_col="file_name",
+        record_limit=record_limit,
     )

     return dataset
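For illustration only (not part of the diff above): the updated ``download_arxiv`` signature pairs ``url_limit`` with the new ``record_limit``, so a quick smoke test might cap both. The output path below is hypothetical.

.. code-block:: python

    from nemo_curator import get_client
    from nemo_curator.download import download_arxiv

    client = get_client(cluster_type="cpu")

    # Sketch: download a single Arxiv tar file (url_limit=1) and extract at most
    # 1000 LaTeX projects from it (record_limit=1000), per the new parameters.
    arxiv_dataset = download_arxiv(
        "/extracted/arxiv",  # hypothetical output_path
        output_type="jsonl",
        url_limit=1,
        record_limit=1000,
    )
    arxiv_dataset.to_json(output_path="/extracted/arxiv", write_to_filename=True)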

nemo_curator/download/commoncrawl.py

Lines changed: 38 additions & 30 deletions
@@ -17,6 +17,7 @@
 import subprocess
 import unicodedata
 from abc import ABC, abstractmethod
+from typing import Literal, Optional
 from urllib.parse import urlparse

 import justext
@@ -352,48 +353,54 @@ def extract(self, content):
         if text is not None:
             if len(text) > 0:
                 text = "\n\n".join(text)
-                meta = {"language": lang}
-                return meta, text
+                meta = {"language": lang, "text": text}
+                return meta
         else:
-            return None, None
+            return None


 def download_common_crawl(
     output_path: str,
     start_snapshot: str,
     end_snapshot: str,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     algorithm=JusTextExtractor(),
-    news=False,
-    aws=False,
-    raw_download_dir=None,
-    keep_raw_download=False,
-    force_download=False,
-    url_limit=None,
+    news: bool = False,
+    aws: bool = False,
+    raw_download_dir: Optional[str] = None,
+    keep_raw_download: bool = False,
+    force_download: bool = False,
+    url_limit: Optional[int] = None,
+    record_limit: Optional[int] = None,
 ) -> DocumentDataset:
     """
-    Downloads Common Crawl WARC snapshots and extracts them using jusText or Resiliparse
+    Downloads Common Crawl WARC snapshots and extracts text content using a specified extraction algorithm.

     Args:
-        output_path: The path to the root directory of the files
-        start_snapshot: The first common crawl snapshot to include. Snapshots must be
-            specified by YYYY-WeekNumber (e.g., '2020-50' or '2021-04'). For the CC-NEWS dataset,
-            (specified with news=True flag) this changes to Year-Month (YYYY-MM).
-        end_snapshot: The last common crawl snapshot to include. Must be chronologically
-            after the starting snapshot.
-        output_type: The file type to save the data as.
-        algorithm: A JusTextExtractor or ResiliparseExtractor object.
-        news: If True, gets WARC URLs for the CC-NEWS dataset instead of the CC-MAIN datasets.
-            Also assumes that the format for the start and end snapshots is 'YYYY-MM' (Year-Month).
-        aws: Whether to download from Common Crawl's S3 bucket. If True, uses s5cmd to download.
-            If False, uses wget.
-        raw_download_dir: Path to store the raw download files for intermediate processing.
-            If None, they are stored in a folder named "downloads" under output_path.
-        keep_raw_download: If True, keeps the compressed WARC files that have not been extracted.
-        force_download: If False, will skip processing all files in output_paths that already exist and
-            directly read from them instead.
-        url_limit: The maximum number of raw files to download from the snapshot. If None, all
-            files from the range of snapshots are downloaded.
+        output_path (str): The root directory used for managing download and extraction.
+            • Raw WARC files are stored in a "downloads" subdirectory under this path.
+            • This path is also checked for existing extraction results; if found, extraction can be skipped.
+            • Note: This function returns a DocumentDataset, and writing the extracted data to disk is the caller's responsibility.
+        start_snapshot (str): Identifier for the earliest snapshot to process.
+            • For CC-MAIN datasets, use the format 'YYYY-WeekNumber' (e.g., '2020-50' or '2021-04').
+            • For CC-NEWS datasets (when news=True), use the 'YYYY-MM' (Year-Month) format.
+        end_snapshot (str): Identifier for the latest snapshot to process, which must be chronologically after start_snapshot.
+        output_type (Literal["jsonl", "parquet"]): The file format for the extracted output. Must be either "jsonl" or "parquet".
+            • This is not used for the output file, but is used to check if an extracted output already exists.
+        algorithm: The text extraction algorithm instance (e.g., JusTextExtractor or ResiliparseExtractor) to use for HTML processing.
+        news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset.
+            • This also means snapshot identifiers should follow the 'YYYY-MM' format.
+        aws (bool): If True, downloads are sourced from Common Crawl's S3 bucket using s5cmd;
+            • If False, wget is used to fetch the files via HTTPS.
+        raw_download_dir: Optional; the directory to temporarily store raw WARC files.
+            • If not provided, defaults to a "downloads" folder within output_path.
+        keep_raw_download (bool): If True, retains the downloaded raw WARC files after extraction.
+            • If False, these raw files may be removed following extraction.
+        force_download (bool): If False, skips re-downloading or re-extracting snapshots if outputs already exist in output_path.
+        url_limit: Optional; the maximum number of WARC files to download from the snapshot range.
+            • If None, all available files within the specified snapshots are downloaded.
+        record_limit: Optional; the maximum number of records to extract from each WARC file.
+            • If None, all available records are extracted.
     """
     common_crawl_urls = get_common_crawl_urls(
         starting_snapshot=start_snapshot, ending_snapshot=end_snapshot, news=news
@@ -443,6 +450,7 @@ def download_common_crawl(
         keep_raw_download=keep_raw_download,
         force_download=force_download,
         filename_col="file_name",
+        record_limit=record_limit,
     )

     return dataset
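Tying the docstring together, here is a brief sketch (not from the commit) of how the new ``record_limit`` parameter might combine with the CC-NEWS mode described above; the snapshots and output path are illustrative only.

.. code-block:: python

    from nemo_curator import get_client
    from nemo_curator.download import download_common_crawl

    client = get_client(cluster_type="cpu")

    # Sketch: CC-NEWS snapshots use the 'YYYY-MM' format (news=True), and
    # record_limit caps how many records are extracted from each WARC file.
    news_dataset = download_common_crawl(
        "/extracted/cc-news",  # hypothetical output_path
        "2021-04",             # start_snapshot (Year-Month for CC-NEWS)
        "2021-10",             # end_snapshot
        output_type="jsonl",
        news=True,
        record_limit=5000,
    )
    news_dataset.to_json(output_path="/extracted/cc-news", write_to_filename=True)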
