
Commit 9a2bd42

Add support for Chinese and Japanese stop words (#507)
* add zh and ja stopwords
* run isort
* edit doc
* indent?
* rst file
* rst?
* more indents?
* fix todos and add pytests
* run black
* add Ryan's suggestions
* run isort
* edit rst file
* add trafilatura support

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
Signed-off-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com>
1 parent 1f8e5ad commit 9a2bd42

6 files changed: +1300 -73 lines changed


docs/user-guide/download.rst

Lines changed: 24 additions & 4 deletions
@@ -80,7 +80,7 @@ By "extraction", we typically mean the process of converting a data format from
 * ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
 * ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
 
-You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.
+You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.
 
 .. code-block:: python
 
@@ -133,13 +133,33 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
 
 Above, we changed the extraction algorithm from the default ``JusTextExtractor``. **Note:** The JusTextExtractor, ResiliparseExtractor, and TrafilaturaExtractor classes each have their own unique parameters which are specific to their extraction algorithms. Please see the docstrings for each class for more details.
 
+You can set your own dictionary of stop words by language to be used when extracting text:
+
+.. code-block:: python
+
+    from nemo_curator.download import download_common_crawl
+
+    # Change the default stop list used
+    stop_lists = {"ENGLISH": frozenset(["the", "and", "is", "in", "for", "where", "when", "to", "at"])}
+
+    common_crawl = download_common_crawl(
+        "/extracted/output/folder",
+        "2020-50",
+        "2021-04",
+        output_type="jsonl",
+        stop_lists=stop_lists,
+    )
+
+This may be desirable to further customize your text extraction pipeline, or to enable text extraction support for languages not included by jusText and NeMo Curator.
+
 The return value ``common_crawl`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.
 
 NeMo Curator's Common Crawl extraction process looks like this under the hood:
 
-1. Decode the HTML within the record from binary to text.
-2. If the HTML can be properly decoded, then with `pyCLD2 <https://github.com/aboSamoor/pycld2>`_, perform language detection on the input HTML.
-3. Finally, the extract the relevant text with `jusText <https://github.com/miso-belica/jusText>`_, `Resiliparse <https://github.com/chatnoir-eu/chatnoir-resiliparse>`_, or `Trafilatura <https://trafilatura.readthedocs.io/en/latest/>`_ from the HTML and write it out as a single string within the 'text' field of a json entry within a `.jsonl` file.
+1. Decode the HTML within the record from binary to text.
+2. If the HTML can be properly decoded, then with `pyCLD2 <https://github.com/aboSamoor/pycld2>`_, perform language detection on the input HTML.
+3. Finally, extract the relevant text with `jusText <https://github.com/miso-belica/jusText>`_, `Resiliparse <https://github.com/chatnoir-eu/chatnoir-resiliparse>`_, or `Trafilatura <https://trafilatura.readthedocs.io/en/latest/>`_ from the HTML and write it out as a single string within the "text" field of a JSON entry within a ``.jsonl`` file.
+
 * ``download_wikipedia`` will download and extract the latest wikipedia dump. Files are downloaded using ``wget``. Wikipedia might download slower than the other datasets. This is because they limit the number of downloads that can occur per-ip address.
 
 .. code-block:: python
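
For illustration, a minimal, self-contained sketch of those three steps outside of NeMo Curator (assuming the ``pycld2`` and ``justext`` packages are installed) might look like the following. It is only a sketch of the pipeline described above, not the library's actual implementation, which also supports the Resiliparse and Trafilatura backends and maps pyCLD2 language names onto its stop lists:

    import justext
    import pycld2 as cld2

    def extract_record_text(html_bytes):
        # 1. Decode the HTML within the record from binary to text.
        try:
            html = html_bytes.decode("utf-8")
        except UnicodeDecodeError:
            return None

        # 2. Perform language detection on the decoded HTML with pyCLD2.
        is_reliable, _, details = cld2.detect(html)
        if not is_reliable:
            return None
        language = details[0][0]  # e.g. "ENGLISH"

        # 3. Extract the relevant text (here with jusText only) and join the
        #    paragraphs into the single string that would be written to the
        #    "text" field of a .jsonl entry.
        paragraphs = justext.justext(html, justext.get_stoplist(language.title()))
        text = "\n\n".join(p.text for p in paragraphs if not p.is_boilerplate)
        return text or None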

nemo_curator/download/commoncrawl.py

Lines changed: 114 additions & 40 deletions
@@ -16,6 +16,7 @@
 import os
 import subprocess
 import unicodedata
+import warnings
 from abc import ABC, abstractmethod
 from copy import deepcopy
 from typing import Literal, Optional
@@ -40,6 +41,8 @@
 from nemo_curator.utils.download_utils import get_common_crawl_urls
 from nemo_curator.utils.file_utils import expand_outdir_and_mkdir
 
+NON_SPACED_LANGUAGES = ["THAI", "CHINESE", "JAPANESE", "KOREAN"]
+
 
 def decode_html(html_bytes):
     # Convert from bytes to text using utf-8 encoding
@@ -76,7 +79,7 @@ def lang_detect(decoded_html):
 
 class HTMLExtractorAlgorithm(ABC):
     @abstractmethod
-    def extract_text(self, html, stop_words):
+    def extract_text(self, html, stop_words, language):
         pass
 
 
@@ -90,6 +93,7 @@ def __init__(
         max_link_density=0.2,
         max_heading_distance=200,
         no_headings=False,
+        is_boilerplate=None,
         logger=None,
     ):
         """
@@ -123,6 +127,9 @@ def __init__(
             max_link_density: Maximum allowed link density in the text.
             max_heading_distance: Maximum distance from a heading to consider text for extraction.
             no_headings: If True, text extraction will ignore headings.
+            is_boilerplate: If True, text extraction will ignore boilerplate content.
+                Default is True for space-separated languages and False for non-space-separated languages
+                (Thai, Chinese, Japanese, and Korean).
             logger: Optional logger instance for logging messages.
 
         """
@@ -133,9 +140,10 @@ def __init__(
         self.max_link_density = max_link_density
         self.max_heading_distance = max_heading_distance
         self.no_headings = no_headings
+        self.is_boilerplate = is_boilerplate
         self.logger = logger
 
-    def extract_text(self, html, stop_words):
+    def extract_text(self, html, stop_words, language):
         # Segment the HTML into paragraphs
         try:
             # Form the DOM tree
@@ -149,6 +157,7 @@ def extract_text(self, html, stop_words):
             if self.logger is not None:
                 self.logger.info("Could not segment paragaphs in the document")
             return
+
         paragraphs = handler.paragraphs
 
         # Context free classification
@@ -175,7 +184,21 @@ def extract_text(self, html, stop_words):
             self.max_heading_distance,
         )
 
-        return [p.text for p in paragraphs if not p.is_boilerplate]
+        if self.is_boilerplate is None:
+            if language in NON_SPACED_LANGUAGES:
+                warnings.warn("Disabling is_boilerplate check for jusText extraction.")
+                is_boilerplate = False
+            else:
+                is_boilerplate = True
+
+        else:
+            is_boilerplate = self.is_boilerplate
+
+        if is_boilerplate:
+            return [p.text for p in paragraphs if not p.is_boilerplate]
+
+        else:
+            return [p.text for p in paragraphs]
 
 
 class ResiliparseExtractor(HTMLExtractorAlgorithm):
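
A brief usage sketch of the new ``is_boilerplate`` knob. The ``download_common_crawl`` import follows the documentation above; ``JusTextExtractor`` is imported here straight from ``nemo_curator.download.commoncrawl`` because that is the module shown in this diff (the package may also re-export it), so treat the import path as an assumption:

    from nemo_curator.download import download_common_crawl
    from nemo_curator.download.commoncrawl import JusTextExtractor

    # Leaving is_boilerplate=None keeps the default: the boilerplate filter stays
    # on for space-separated languages and is turned off (with a warning) for
    # Thai, Chinese, Japanese, and Korean. Passing an explicit value overrides that.
    extractor = JusTextExtractor(is_boilerplate=True)

    common_crawl = download_common_crawl(
        "/extracted/output/folder",
        "2020-50",
        "2021-04",
        output_type="jsonl",
        algorithm=extractor,
    )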
@@ -212,26 +235,34 @@ def __init__(
         self.main_content = main_content
         self.alt_texts = alt_texts
 
-    def extract_text(self, html, stop_words):
+    def extract_text(self, html, stop_words, language):
         text = extract_plain_text(
             html, main_content=self.main_content, alt_texts=self.alt_texts
         )
 
         paragraphs = list(filter(None, text.split("\n")))
-        result = []
-        for paragraph in paragraphs:
-            words = paragraph.split()
-            length = len(words)
-            if length == 0:
-                continue
-            stopwords = [word for word in words if word in stop_words]
-            stopword_density = len(stopwords) / length
 
-            if stopword_density >= self.required_stopword_density:
-                result.append(paragraph)
+        if language in NON_SPACED_LANGUAGES:
+            warnings.warn(
+                "stopword_density is ignored for non-space-separated languages."
+            )
+            result = paragraphs
+        else:
+            result = []
+
+            for paragraph in paragraphs:
+                words = paragraph.split()
+                length = len(words)
+
+                if length == 0:
+                    continue
+
+                stopwords = [word for word in words if word in stop_words]
+                stopword_density = len(stopwords) / length
+
+                if stopword_density >= self.required_stopword_density:
+                    result.append(paragraph)
 
-        if len(result) == 0:
-            return None
         return result
 
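
For context, the stop-word density heuristic that this hunk now bypasses for non-space-separated languages can be shown in isolation. This is a standalone sketch of the same idea (the 0.32 threshold is just an illustrative value), not the library code itself:

    def filter_by_stopword_density(paragraphs, stop_words, required_stopword_density=0.32):
        # Keep only paragraphs whose fraction of stop words meets the threshold.
        # Splitting on whitespace is what makes this unusable for Thai, Chinese,
        # Japanese, and Korean, which is why those languages skip the filter.
        result = []
        for paragraph in paragraphs:
            words = paragraph.split()
            if not words:
                continue
            density = sum(word in stop_words for word in words) / len(words)
            if density >= required_stopword_density:
                result.append(paragraph)
        return result

    kept = filter_by_stopword_density(
        ["the cat sat on the mat", "lorem ipsum dolor"],
        stop_words=frozenset(["the", "on", "at", "a"]),
    )
    print(kept)  # ['the cat sat on the mat']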

@@ -300,7 +331,7 @@ def __init__(
         self.max_repetitions = max_repetitions
         self.extract_kwargs = extract_kwargs
 
-    def extract_text(self, html, stop_words):
+    def extract_text(self, html, stop_words, language):
         trafilatura_config = deepcopy(TRAFILATURA_DEFAULT_CONFIG)
         trafilatura_config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = str(
             self.min_extracted_size
@@ -328,17 +359,29 @@ def extract_text(self, html, stop_words):
 
         if text is not None:
             paragraphs = list(filter(None, text.split("\n")))
-            result = []
-            for paragraph in paragraphs:
-                words = paragraph.split()
-                length = len(words)
-                if length == 0:
-                    continue
-                stopwords = [word for word in words if word in stop_words]
-                stopword_density = len(stopwords) / length
 
-                if stopword_density >= self.required_stopword_density:
-                    result.append(paragraph)
+            if language in NON_SPACED_LANGUAGES:
+                warnings.warn(
+                    "stopword_density is ignored for non-space-separated languages."
+                )
+                result = paragraphs
+
+            else:
+                result = []
+
+                for paragraph in paragraphs:
+                    words = paragraph.split()
+                    length = len(words)
+
+                    if length == 0:
+                        continue
+
+                    stopwords = [word for word in words if word in stop_words]
+                    stopword_density = len(stopwords) / length
+
+                    if stopword_density >= self.required_stopword_density:
+                        result.append(paragraph)
+
         else:
             return None
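
The Trafilatura path applies the same density filter. Its tuning attributes are visible in the context lines above (``min_extracted_size``, ``max_repetitions``, ``required_stopword_density``, ``extract_kwargs``); the constructor keywords in this sketch are inferred from those attribute names and are not a documented signature, so check the class docstring before relying on them:

    from nemo_curator.download.commoncrawl import TrafilaturaExtractor

    # Keyword names inferred from the attributes used in extract_text above;
    # the values are placeholders, not recommended settings.
    extractor = TrafilaturaExtractor(
        min_extracted_size=250,
        required_stopword_density=0.3,
        extract_kwargs={"deduplicate": True},
    )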

@@ -357,25 +400,47 @@ def get_stop_list_dict(languages=[]):
         "Norwegian_Nynorsk": "NORWEGIAN_N",
         "Waray_Waray": "WARAY_PHILIPPINES",
     }
+
+    # List obtained from https://github.com/stopwords-iso/stopwords-ja
+    from .ja_stopwords import ja_stopwords
+
+    # List obtained from https://github.com/stopwords-iso/stopwords-th
+    from .th_stopwords import th_stopwords
+
+    # List obtained from https://github.com/stopwords-iso/stopwords-zh
+    from .zh_stopwords import zh_stopwords
+
+    custom_stopwords = {
+        "THAI": th_stopwords,
+        "CHINESE": zh_stopwords,
+        "JAPANESE": ja_stopwords,
+    }
+
     if len(languages) == 0:
         languages = justext.get_stoplists()
-        # Remove latin as it yields a lot of low quality documents
-        languages_no_latin = list(languages)
-        languages_no_latin.remove("Latin")
-        languages = frozenset(languages_no_latin)
+
+        # Remove Latin as it yields a lot of low quality documents
+        languages = list(languages)
+        languages.remove("Latin")
+
+        # Manually add Thai, Chinese, and Japanese
+        languages.append("THAI")
+        languages.append("CHINESE")
+        languages.append("JAPANESE")
+
+        languages = frozenset(languages)
 
     stop_list_dict = {}
     for language in languages:
         if language in lang_map:
             lang_key = lang_map[language]
         else:
             lang_key = language.upper()
-        stop_list_dict[lang_key] = justext.get_stoplist(language)
-
-    # List obtained from https://github.com/stopwords-iso/stopwords-th
-    from .thai_stopwords import thai_stopwords
 
-    stop_list_dict["THAI"] = thai_stopwords
+        if lang_key in custom_stopwords:
+            stop_list_dict[lang_key] = custom_stopwords[lang_key]
+        else:
+            stop_list_dict[lang_key] = justext.get_stoplist(language)
 
     return stop_list_dict
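
To inspect what the refactored helper now returns, or to build a custom mapping on top of it, something like the following works as a sketch. ``get_stop_list_dict`` and the new stop-word modules live under ``nemo_curator/download/``, so the absolute import paths below mirror the relative imports in this hunk and are assumptions to verify against your installation:

    from nemo_curator.download.commoncrawl import get_stop_list_dict
    from nemo_curator.download.zh_stopwords import zh_stopwords

    # Default behaviour: every jusText stop list except Latin, plus the new
    # Thai, Chinese, and Japanese lists, keyed by upper-case language name.
    stop_lists = get_stop_list_dict()
    print("CHINESE" in stop_lists, "JAPANESE" in stop_lists, "THAI" in stop_lists)

    # A hand-built mapping that reuses the bundled Chinese list and restricts
    # extraction to just two languages.
    custom_stop_lists = {
        "CHINESE": zh_stopwords,
        "ENGLISH": frozenset(["the", "and", "is", "in", "to", "of"]),
    }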

@@ -484,8 +549,12 @@ def iterate(self, file_path):
 
 class CommonCrawlWARCExtractor(DocumentExtractor):
 
-    def __init__(self, algorithm=JusTextExtractor()):
-        self._stop_lists = get_stop_list_dict()
+    def __init__(self, algorithm=JusTextExtractor(), stop_lists=None):
+        if stop_lists is not None:
+            self._stop_lists = stop_lists
+        else:
+            self._stop_lists = get_stop_list_dict()
+
         self.algorithm = algorithm
         super().__init__()
 
@@ -496,7 +565,7 @@ def extract(self, content):
         lang = lang_detect(html)
         text = None
         if lang in self._stop_lists:
-            text = self.algorithm.extract_text(html, self._stop_lists[lang])
+            text = self.algorithm.extract_text(html, self._stop_lists[lang], lang)
         if text is not None:
             if len(text) > 0:
                 text = "\n\n".join(text)
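
A short sketch of how the new constructor argument interacts with this ``extract`` method: because extraction only runs when the detected language is a key of ``self._stop_lists``, a user-supplied ``stop_lists`` also acts as a language filter. The import paths follow the module shown in this diff and are assumptions:

    from nemo_curator.download.commoncrawl import (
        CommonCrawlWARCExtractor,
        ResiliparseExtractor,
    )
    from nemo_curator.download.ja_stopwords import ja_stopwords

    # Only records whose detected language is "JAPANESE" will be extracted;
    # all other languages fall through with text left as None.
    extractor = CommonCrawlWARCExtractor(
        algorithm=ResiliparseExtractor(),
        stop_lists={"JAPANESE": ja_stopwords},
    )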
@@ -512,6 +581,7 @@ def download_common_crawl(
     end_snapshot: str,
     output_type: Literal["jsonl", "parquet"] = "jsonl",
     algorithm=JusTextExtractor(),
+    stop_lists=None,
     news: bool = False,
     aws: bool = False,
     raw_download_dir: Optional[str] = None,
@@ -536,6 +606,10 @@ def download_common_crawl(
             • This is not used for the output file, but is used to check if an extracted output already exists.
         algorithm: The text extraction algorithm instance to use for HTML processing.
             • This can be a JusTextExtractor (default), ResiliparseExtractor, or TrafilaturaExtractor object.
+        stop_lists: A dictionary of stop lists, where the keys are languages (e.g., "ENGLISH")
+            and the values are Python frozensets denoting the list of stop words for that language.
+            If None, it defaults to jusText's stop lists: https://github.com/miso-belica/jusText/tree/main/justext/stoplists,
+            with added Thai, Chinese, and Japanese support.
         news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset.
             • This also means snapshot identifiers should follow the 'YYYY-MM' format.
         aws (bool): If True, downloads are sourced from Common Crawl's S3 bucket using s5cmd;
@@ -577,7 +651,7 @@ def download_common_crawl(
     expand_outdir_and_mkdir(raw_download_dir)
     downloader = CommonCrawlWARCDownloader(raw_download_dir, aws=aws)
     iterator = CommonCrawlWARCIterator()
-    extractor = CommonCrawlWARCExtractor(algorithm=algorithm)
+    extractor = CommonCrawlWARCExtractor(algorithm=algorithm, stop_lists=stop_lists)
 
     output_format = {
         "text": str,
