|
1 | | -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. |
| 1 | +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. |
2 | 2 | # |
3 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); |
4 | 4 | # you may not use this file except in compliance with the License. |
|
17 | 17 | import subprocess |
18 | 18 | import unicodedata |
19 | 19 | from abc import ABC, abstractmethod |
| 20 | +from copy import deepcopy |
20 | 21 | from typing import Literal, Optional |
21 | 22 | from urllib.parse import urlparse |
22 | 23 |
|
|
25 | 26 | import pycld2 as cld2 |
26 | 27 | from charset_normalizer import detect |
27 | 28 | from resiliparse.extract.html2text import extract_plain_text |
| 29 | +from trafilatura import extract as extract_with_trafilatura |
| 30 | +from trafilatura.settings import DEFAULT_CONFIG as TRAFILATURA_DEFAULT_CONFIG |
28 | 31 | from warcio.archiveiterator import ArchiveIterator |
29 | 32 |
|
30 | 33 | from nemo_curator.datasets import DocumentDataset |
@@ -92,6 +95,26 @@ def __init__( |
92 | 95 | """ |
93 | 96 | Initialize the jusText text extraction algorithm with specified parameters. |
94 | 97 |
|
| 98 | + jusText is a tool for removing boilerplate content, such as navigation links, headers, and footers from HTML pages. |
| 99 | + It is designed to preserve mainly text containing full sentences and it is therefore well suited for creating linguistic resources such as Web corpora. |
| 100 | + The key idea is that long blocks can often be classified with high confidence, while shorter blocks require context-based adjustments. |
| 101 | +
|
| 102 | + Here is an overview of the jusText algorithm: |
| 103 | + • Segmentation: The document is split into textual blocks based on HTML tags that typically define separate sections (e.g., <div>, <p>, <table>). |
| 104 | + • Preprocessing: Contents of <header>, <style>, and <script> tags are removed. |
| 105 | + Certain elements (e.g., <select>, copyright symbols) are immediately classified as boilerplate. |
| 106 | + • Context-Free Classification: Each block is classified as: |
| 107 | + - Bad (boilerplate) if it has high link density. |
| 108 | + - Short if it is too small to be classified reliably. |
| 109 | + - Near-Good if it has a moderate density of stopwords. |
| 110 | + - Good (main content) if it is long and contains many stopwords. |
| 111 | + • Context-Sensitive Classification: Blocks that were classified as short or near-good are reclassified based on surrounding blocks. |
| 112 | + The assumption is that main content clusters together, as does boilerplate. |
| 113 | + • Headings Processing: Header elements (e.g., <h1>, <h2>) are treated separately to ensure useful headings are preserved. |
| 114 | + Short headers near good content may be reclassified as near-good or good. |
| 115 | +
|
| 116 | + Please refer to the jusText documentation for more details: https://corpus.tools/wiki/Justext/Algorithm |
| 117 | +
|
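A minimal usage sketch of the extractor described above (not part of this commit; the import path is assumed, and the HTML string and stop word set are invented purely for illustration):

# Hedged usage sketch: run the jusText-based extractor on one HTML string.
from nemo_curator.download import JusTextExtractor  # assumed public import path

html = (
    "<html><body>"
    "<p>This is a reasonably long paragraph made of full sentences, so the "
    "jusText heuristics have a good chance of keeping it as main content.</p>"
    "<p><a href='/home'>Home</a> | <a href='/about'>About</a></p>"
    "</body></html>"
)

# In the real pipeline the stop words come from get_stop_list_dict(); a small
# hand-written set is used here purely for illustration.
stop_words = frozenset({"this", "is", "a", "of", "so", "the", "have", "as", "it"})

extractor = JusTextExtractor()  # defaults correspond to the Args documented below
paragraphs = extractor.extract_text(html, stop_words)
print(paragraphs)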
95 | 118 | Args: |
96 | 119 | length_low: Minimum length of text to be considered for extraction. |
97 | 120 | length_high: Maximum length of text to be considered for extraction. |
@@ -165,6 +188,18 @@ def __init__( |
165 | 188 | """ |
166 | 189 | Initialize the Resiliparse text extraction algorithm with specified parameters. |
167 | 190 |
|
| 191 | + The Resiliparse algorithm extracts structural or semantic information from noisy raw web data for further processing, |
| 192 | + such as (main) content extraction / boilerplate removal, schema extraction, general web data cleansing, and more. |
| 193 | +
|
| 194 | + It is implemented via the `extract_plain_text` function in the `resiliparse.extract.html2text` module. |
| 195 | + Resiliparse HTML2Text is a very fast and rule-based plain text extractor for HTML pages which uses the Resiliparse DOM parser. |
| 196 | + The `extract_plain_text` function extracts all visible text nodes inside the HTML document's <body>. |
| 197 | + Only <script>, <style> and a few other (generally) invisible elements are skipped and very basic ASCII formatting is applied. |
| 198 | +
|
| 199 | + Please refer to the Resiliparse documentation for more details: https://resiliparse.chatnoir.eu/en/latest/man/extract/html2text.html |
| 200 | +
|
| 201 | + NeMo Curator has added a stopword density filter to the Resiliparse extraction process, which requires that a paragraph contains a certain proportion of stopwords. |
| 202 | +
|
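The stopword density check mentioned above is simple enough to show in isolation; a self-contained sketch follows (the 0.32 threshold mirrors the default documented below, and the example paragraphs are invented):

# Hedged sketch of the added stopword density filter: keep a paragraph only if
# at least `required_stopword_density` of its whitespace-separated tokens are
# stop words.
REQUIRED_STOPWORD_DENSITY = 0.32

stop_words = frozenset({"the", "a", "of", "and", "is", "in", "it"})

def passes_density_filter(paragraph: str) -> bool:
    words = paragraph.split()
    if not words:
        return False
    density = sum(1 for word in words if word in stop_words) / len(words)
    return density >= REQUIRED_STOPWORD_DENSITY

print(passes_density_filter("The cat is in the garden and it is asleep."))  # True (density 0.6)
print(passes_density_filter("Home About Contact Privacy Terms"))            # False (density 0.0)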
168 | 203 | Args: |
169 | 204 |             required_stopword_density: Proportion of stopwords required to preserve an extracted paragraph.
170 | 205 | Studies on stopword lists and their distribution in various text corpora often |
@@ -200,6 +235,118 @@ def extract_text(self, html, stop_words): |
200 | 235 | return result |
201 | 236 |
|
202 | 237 |
|
| 238 | +class TrafilaturaExtractor(HTMLExtractorAlgorithm): |
| 239 | + def __init__( |
| 240 | + self, |
| 241 | + required_stopword_density=0.32, |
| 242 | + min_extracted_size=250, |
| 243 | + min_extracted_comm_size=1, |
| 244 | + min_output_size=1, |
| 245 | + min_output_comm_size=1, |
| 246 | + max_tree_size=None, |
| 247 | + min_duplcheck_size=100, |
| 248 | + max_repetitions=2, |
| 249 | + **extract_kwargs, |
| 250 | + ): |
| 251 | + """ |
| 252 | + Initialize the Trafilatura text extraction algorithm with specified parameters. |
| 253 | +
|
| 254 | + The Trafilatura extraction process combines readability-lxml and jusText as fallbacks to ensure robustness. |
| 255 | + Trafilatura's own algorithm follows a cascade of rule-based filters and content heuristics: |
| 256 | + • Content Delimitation: Uses XPath expressions to exclude unwanted HTML elements (e.g., navigation bars) and focus on relevant content (e.g., article body). |
| 257 | + Extracted HTML nodes are analyzed for relevance based on element type, text length, and link density. |
| 258 | + • Fallback Mechanism: If extraction seems faulty, alternative algorithms are run as backups. |
| 259 | + These use heuristics like line length, text-to-markup ratio, and HTML depth to improve extraction. |
| 260 | + Outputs are compared, prioritizing longer extractions with fewer impurities. |
| 261 | + • Baseline Extraction: If all else fails, it searches for text elements that might have been missed, discarding irrelevant content. |
| 262 | +
|
| 263 | + The system balances precision and recall, extracting main text, comments, and metadata (title, site name, author, date, categories, tags). |
| 264 | +
|
| 265 | + Please refer to the Trafilatura documentation for more details: |
| 266 | + https://trafilatura.readthedocs.io/en/latest/ and https://aclanthology.org/2021.acl-demo.15/ |
| 267 | +
|
| 268 | + NeMo Curator has added a stopword density filter to the Trafilatura extraction process, which requires that a paragraph contains a certain proportion of stopwords. |
| 269 | +
|
| 270 | + Args: |
| 271 | +            required_stopword_density: Proportion of stopwords required to preserve an extracted paragraph.
| 272 | + Studies on stopword lists and their distribution in various text corpora often |
| 273 | + suggest that around 30-40% of a typical English text consists of stopwords. |
| 274 | + min_extracted_size: Acceptable size in characters (used to trigger fallbacks). |
| 275 | + Defaults to 250. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html. |
| 276 | +            min_extracted_comm_size: Works the same as min_extracted_size, but for comment extraction.
| 277 | + Defaults to 1. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html. |
| 278 | + min_output_size: Absolute acceptable minimum for main text output. |
| 279 | + Defaults to 1. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html. |
| 280 | +            min_output_comm_size: Works the same as min_output_size, but for comment extraction.
| 281 | + Defaults to 1. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html. |
| 282 | + max_tree_size: Used to discard documents with too many elements. Defaults to None. |
| 283 | + min_duplcheck_size: Minimum size in characters to run deduplication on. |
| 284 | + Defaults to 100. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html. |
| 285 | + max_repetitions: Maximum number of duplicates allowed. |
| 286 | + Defaults to 2. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html. |
| 287 | + extract_kwargs: Additional keyword arguments for the Trafilatura extract function. |
| 288 | + See API documentation https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract |
| 289 | + for list of possible parameters. |
| 290 | + All arguments are set to their default values, except for deduplicate (bool) which is set to True. |
| 291 | +
|
| 292 | + """ |
| 293 | + self.required_stopword_density = required_stopword_density |
| 294 | + self.min_extracted_size = min_extracted_size |
| 295 | + self.min_extracted_comm_size = min_extracted_comm_size |
| 296 | + self.min_output_size = min_output_size |
| 297 | + self.min_output_comm_size = min_output_comm_size |
| 298 | + self.max_tree_size = max_tree_size |
| 299 | + self.min_duplcheck_size = min_duplcheck_size |
| 300 | + self.max_repetitions = max_repetitions |
| 301 | + self.extract_kwargs = extract_kwargs |
| 302 | + |
| 303 | + def extract_text(self, html, stop_words): |
| 304 | + trafilatura_config = deepcopy(TRAFILATURA_DEFAULT_CONFIG) |
| 305 | + trafilatura_config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = str( |
| 306 | + self.min_extracted_size |
| 307 | + ) |
| 308 | + trafilatura_config["DEFAULT"]["MIN_EXTRACTED_COMM_SIZE"] = str( |
| 309 | + self.min_extracted_comm_size |
| 310 | + ) |
| 311 | + trafilatura_config["DEFAULT"]["MIN_OUTPUT_SIZE"] = str(self.min_output_size) |
| 312 | + trafilatura_config["DEFAULT"]["MIN_OUTPUT_COMM_SIZE"] = str( |
| 313 | + self.min_output_comm_size |
| 314 | + ) |
| 315 | + if self.max_tree_size: |
| 316 | + trafilatura_config["DEFAULT"]["MAX_TREE_SIZE"] = str(self.max_tree_size) |
| 317 | + trafilatura_config["DEFAULT"]["MIN_DUPLCHECK_SIZE"] = str( |
| 318 | + self.min_duplcheck_size |
| 319 | + ) |
| 320 | + trafilatura_config["DEFAULT"]["MAX_REPETITIONS"] = str(self.max_repetitions) |
| 321 | + |
| 322 | + # Recommended to set deduplicate=True |
| 323 | + self.extract_kwargs.setdefault("deduplicate", True) |
| 324 | + |
| 325 | + text = extract_with_trafilatura( |
| 326 | + html, config=trafilatura_config, **self.extract_kwargs |
| 327 | + ) |
| 328 | + |
| 329 | + if text is not None: |
| 330 | + paragraphs = list(filter(None, text.split("\n"))) |
| 331 | + result = [] |
| 332 | + for paragraph in paragraphs: |
| 333 | + words = paragraph.split() |
| 334 | + length = len(words) |
| 335 | + if length == 0: |
| 336 | + continue |
| 337 | + stopwords = [word for word in words if word in stop_words] |
| 338 | + stopword_density = len(stopwords) / length |
| 339 | + |
| 340 | + if stopword_density >= self.required_stopword_density: |
| 341 | + result.append(paragraph) |
| 342 | + else: |
| 343 | + return None |
| 344 | + |
| 345 | + if len(result) == 0: |
| 346 | + return None |
| 347 | + return result |
| 348 | + |
| 349 | + |
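A hedged configuration sketch for the new extractor (the import path, size values, and sample HTML are assumptions for illustration; favor_precision is an option of trafilatura.extract forwarded through extract_kwargs):

# Hedged sketch: configure the Trafilatura-based extractor and run it once.
from nemo_curator.download import TrafilaturaExtractor  # assumed public import path

extractor = TrafilaturaExtractor(
    required_stopword_density=0.32,  # paragraph-level stopword filter added by NeMo Curator
    min_extracted_size=100,          # lower the fallback trigger for this tiny example
    max_repetitions=2,               # allow at most two duplicated segments
    favor_precision=True,            # forwarded to trafilatura.extract() via extract_kwargs
)

html = (
    "<html><body><article><p>"
    "The quick brown fox jumps over the lazy dog while the sun is setting in the west. "
    "It is a calm evening and the air is full of the smell of rain."
    "</p></article></body></html>"
)
stop_words = frozenset({"the", "a", "of", "and", "is", "in", "it", "over", "while", "full"})

paragraphs = extractor.extract_text(html, stop_words)
print(paragraphs)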
203 | 350 | def get_stop_list_dict(languages=[]): |
204 | 351 |
|
205 | 352 | # Name mapping for language names from CLD2 (values) |
@@ -387,7 +534,8 @@ def download_common_crawl( |
387 | 534 | end_snapshot (str): Identifier for the latest snapshot to process, which must be chronologically after start_snapshot. |
388 | 535 | output_type (Literal["jsonl", "parquet"]): The file format for the extracted output. Must be either "jsonl" or "parquet". |
389 | 536 | • This is not used for the output file, but is used to check if an extracted output already exists. |
390 | | - algorithm: The text extraction algorithm instance (e.g., JusTextExtractor or ResiliparseExtractor) to use for HTML processing. |
| 537 | + algorithm: The text extraction algorithm instance to use for HTML processing. |
| 538 | + • This can be a JusTextExtractor (default), ResiliparseExtractor, or TrafilaturaExtractor object. |
391 | 539 | news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset. |
392 | 540 | • This also means snapshot identifiers should follow the 'YYYY-MM' format. |
393 | 541 | aws (bool): If True, downloads are sourced from Common Crawl's S3 bucket using s5cmd; |
|