Skip to content

Commit 7d06c12

Browse files
Merge branch 'main' into ML-593/quote-standardization
2 parents a4be1d6 + 4140f62 commit 7d06c12

File tree

5 files changed

+155
-93
lines changed

5 files changed

+155
-93
lines changed

CHANGELOG.md

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,3 @@
1-
## 0.16.10-dev0
2-
3-
### Enhancements
4-
5-
### Features
6-
7-
### Fixes
8-
91
## 0.16.9
102

113
### Enhancements
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# pyright: reportPrivateUsage=false
2+
3+
"""
4+
Script to render HTML from unstructured elements.
5+
NOTE: This script is not intended to be used as a module.
6+
NOTE: For now script is only intended to be used with elements generated with
7+
`partition_html(html_parser_version=v2)`
8+
TODO: It was noted that unstructured_elements_to_ontology func always returns a single page
9+
This script is using helper functions to handle multiple pages.
10+
"""
11+
12+
import argparse
13+
import logging
14+
import os
15+
import select
16+
import sys
17+
from collections import defaultdict
18+
from typing import List, Sequence
19+
20+
from bs4 import BeautifulSoup
21+
22+
from unstructured.documents import elements
23+
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
24+
from unstructured.staging.base import elements_from_json
25+
26+
# Configure module-level logging: timestamped INFO-and-above messages.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Module logger used by all helpers in this script.
logger = logging.getLogger(__name__)
29+
30+
31+
def extract_document_div(html_content: str) -> str:
    """Return the opening tag of the top-level document div.

    Takes everything up to and including the first '>' in *html_content*,
    which for a rendered document is the opening ``<div ...>`` tag.

    Raises:
        ValueError: if *html_content* contains no '>' character.
    """
    tag_end = html_content.find(">")
    if tag_end == -1:
        logger.error("No '>' found in the HTML content.")
        raise ValueError("No '>' found in the HTML content.")
    return html_content[: tag_end + 1]
37+
38+
39+
def extract_page_div(html_content: str) -> str:
    """Return the single ``<div class="Page">`` element from *html_content*.

    Raises:
        ValueError: if the parsed HTML does not contain exactly one
            Page div.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    matches = parsed.find_all("div", class_="Page")
    if len(matches) == 1:
        return str(matches[0])
    logger.error(
        "Expected exactly one <div> element with class 'Page'. Found %d.", len(matches)
    )
    raise ValueError("Expected exactly one <div> element with class 'Page'.")
48+
49+
50+
def fold_document_div(
    html_document_start: str, html_document_end: str, html_per_page: List[str]
) -> str:
    """Assemble the document: opening tag, pages in order, closing tag."""
    return "".join([html_document_start, *html_per_page, html_document_end])
58+
59+
60+
def group_elements_by_page(
    unstructured_elements: Sequence[elements.Element],
) -> Sequence[Sequence[elements.Element]]:
    """Bucket elements by their ``metadata.page_number``, preserving order.

    Returns one inner sequence per distinct page number, in first-seen order.
    """
    by_page = defaultdict(list)
    for el in unstructured_elements:
        by_page[el.metadata.page_number].append(el)
    return list(by_page.values())
71+
72+
73+
def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str:
    """Render an HTML document from unstructured elements.

    Exactly one of *filepath* or *text* must be provided.

    Args:
        filepath: path to a JSON file with unstructured elements.
        text: JSON string with unstructured elements.

    Returns:
        str: the rendered HTML document.

    Raises:
        ValueError: if neither or both of *filepath* and *text* are given,
            or if the elements yield no pages to render.
    """
    if filepath is None and text is None:
        logger.error("Either filepath or text must be provided.")
        raise ValueError("Either filepath or text must be provided.")
    if filepath is not None and text is not None:
        logger.error("Both filepath and text cannot be provided.")
        raise ValueError("Both filepath and text cannot be provided.")
    if filepath is not None:
        logger.info("Rendering HTML from file: %s", filepath)
    else:
        logger.info("Rendering HTML from text.")

    unstructured_elements = elements_from_json(filename=filepath, text=text)
    unstructured_elements_per_page = group_elements_by_page(unstructured_elements)
    # unstructured_elements_to_ontology always returns a single page (see the
    # module-level TODO), so it is applied once per page group.
    parsed_ontology_per_page = [
        unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page
    ]
    html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page]
    if not html_per_page:
        raise ValueError("No pages were produced from the provided elements.")

    # The document <div> wraps all pages: reuse the opening tag from the
    # first rendered page and close it once after the last page.
    html_document_start = extract_document_div(html_per_page[0])
    html_document_end = "</div>"
    html_per_page = [extract_page_div(page) for page in html_per_page]

    return fold_document_div(html_document_start, html_document_end, html_per_page)
106+
107+
108+
def _main():
    """CLI entry point.

    When the PROCESS_FROM_STDIN environment variable is "true", elements JSON
    is read from STDIN and the rendered HTML is written to STDOUT. Otherwise
    a filepath is taken from the command line and the HTML is written next to
    it, or into --outdir when given.
    """
    if os.getenv("PROCESS_FROM_STDIN") == "true":
        logger.info("Processing from STDIN (PROCESS_FROM_STDIN is set to 'true')")
        # Poll with a short timeout so we fail fast instead of blocking
        # forever when nothing was piped in.
        # NOTE(review): select() on stdin works for pipes on POSIX only —
        # this branch will not behave correctly on Windows. Confirm intended
        # deployment targets.
        if select.select([sys.stdin], [], [], 0.1)[0]:
            content = sys.stdin.read()
            html = rendered_html(text=content)
            sys.stdout.write(html)
        else:
            logger.error("No input provided via STDIN. Exiting.")
            sys.exit(1)
    else:
        logger.info("Processing from command line arguments")
        parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.")
        parser.add_argument(
            "filepath", help="Path to JSON file with unstructured elements.", type=str
        )
        parser.add_argument(
            "--outdir",
            help="Path to directory where the rendered html will be stored.",
            type=str,
            default=None,
            nargs="?",
        )
        args = parser.parse_args()

        html = rendered_html(filepath=args.filepath)
        if args.outdir is None:
            args.outdir = os.path.dirname(args.filepath)
        os.makedirs(args.outdir, exist_ok=True)
        # Replace only a trailing ".json" extension. str.replace would also
        # rewrite a ".json" occurring in the middle of the file name
        # (e.g. "a.json.bak.json" -> "a.html.bak.json").
        basename = os.path.basename(args.filepath)
        if basename.endswith(".json"):
            basename = basename[: -len(".json")] + ".html"
        outpath = os.path.join(args.outdir, basename)
        with open(outpath, "w") as f:
            f.write(html)
        logger.info("HTML rendered and saved to: %s", outpath)
143+
144+
145+
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    _main()

test_unstructured/nlp/test_tokenize.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99

1010
def test_nltk_packages_download_if_not_present():
11+
tokenize._download_nltk_packages_if_not_present.cache_clear()
1112
with patch.object(nltk, "find", side_effect=LookupError):
1213
with patch.object(tokenize, "download_nltk_packages") as mock_download:
1314
tokenize._download_nltk_packages_if_not_present()
@@ -16,6 +17,7 @@ def test_nltk_packages_download_if_not_present():
1617

1718

1819
def test_nltk_packages_do_not_download_if():
20+
tokenize._download_nltk_packages_if_not_present.cache_clear()
1921
with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
2022
tokenize._download_nltk_packages_if_not_present()
2123

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.9-dev0" # pragma: no cover
1+
__version__ = "0.16.9" # pragma: no cover

unstructured/nlp/tokenize.py

Lines changed: 6 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,6 @@
11
from __future__ import annotations
22

3-
import hashlib
43
import os
5-
import sys
6-
import tarfile
7-
import tempfile
8-
import urllib.request
94
from functools import lru_cache
105
from typing import Final, List, Tuple
116

@@ -16,86 +11,10 @@
1611

1712
CACHE_MAX_SIZE: Final[int] = 128
1813

19-
NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
20-
NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
21-
NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"
22-
23-
24-
# NOTE(robinson) - mimic default dir logic from NLTK
25-
# https://github.com/nltk/nltk/
26-
# blob/8c233dc585b91c7a0c58f96a9d99244a379740d5/nltk/downloader.py#L1046
27-
def get_nltk_data_dir() -> str | None:
28-
"""Locates the directory the nltk data will be saved too. The directory
29-
set by the NLTK environment variable takes highest precedence. Otherwise
30-
the default is determined by the rules indicated below. Returns None when
31-
the directory is not writable.
32-
33-
On Windows, the default download directory is
34-
``PYTHONHOME/lib/nltk``, where *PYTHONHOME* is the
35-
directory containing Python, e.g. ``C:\\Python311``.
36-
37-
On all other platforms, the default directory is the first of
38-
the following which exists or which can be created with write
39-
permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
40-
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
41-
"""
42-
# Check if we are on GAE where we cannot write into filesystem.
43-
if "APPENGINE_RUNTIME" in os.environ:
44-
return
45-
46-
# Check if we have sufficient permissions to install in a
47-
# variety of system-wide locations.
48-
for nltkdir in nltk.data.path:
49-
if os.path.exists(nltkdir) and nltk.internals.is_writable(nltkdir):
50-
return nltkdir
51-
52-
# On Windows, use %APPDATA%
53-
if sys.platform == "win32" and "APPDATA" in os.environ:
54-
homedir = os.environ["APPDATA"]
55-
56-
# Otherwise, install in the user's home directory.
57-
else:
58-
homedir = os.path.expanduser("~/")
59-
if homedir == "~/":
60-
raise ValueError("Could not find a default download directory")
61-
62-
# NOTE(robinson) - NLTK appends nltk_data to the homedir. That's already
63-
# present in the tar file so we don't have to do that here.
64-
return homedir
65-
6614

6715
def download_nltk_packages():
68-
nltk_data_dir = get_nltk_data_dir()
69-
70-
if nltk_data_dir is None:
71-
raise OSError("NLTK data directory does not exist or is not writable.")
72-
73-
# Check if the path ends with "nltk_data" and remove it if it does
74-
if nltk_data_dir.endswith("nltk_data"):
75-
nltk_data_dir = os.path.dirname(nltk_data_dir)
76-
77-
def sha256_checksum(filename: str, block_size: int = 65536):
78-
sha256 = hashlib.sha256()
79-
with open(filename, "rb") as f:
80-
for block in iter(lambda: f.read(block_size), b""):
81-
sha256.update(block)
82-
return sha256.hexdigest()
83-
84-
with tempfile.TemporaryDirectory() as temp_dir_path:
85-
tgz_file_path = os.path.join(temp_dir_path, NLTK_DATA_FILENAME)
86-
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
87-
88-
file_hash = sha256_checksum(tgz_file_path)
89-
if file_hash != NLTK_DATA_SHA256:
90-
os.remove(tgz_file_path)
91-
raise ValueError(f"SHA-256 mismatch: expected {NLTK_DATA_SHA256}, got {file_hash}")
92-
93-
# Extract the contents
94-
if not os.path.exists(nltk_data_dir):
95-
os.makedirs(nltk_data_dir)
96-
97-
with tarfile.open(tgz_file_path, "r:gz") as tar:
98-
tar.extractall(path=nltk_data_dir)
16+
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
17+
nltk.download("punkt_tab", quiet=True)
9918

10019

10120
def check_for_nltk_package(package_name: str, package_category: str) -> bool:
@@ -109,10 +28,13 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool:
10928
try:
11029
nltk.find(f"{package_category}/{package_name}", paths=paths)
11130
return True
112-
except LookupError:
31+
except (LookupError, OSError):
11332
return False
11433

11534

35+
# We cache this because we do not want to attempt
36+
# downloading the packages multiple times
37+
@lru_cache()
11638
def _download_nltk_packages_if_not_present():
11739
"""If required NLTK packages are not available, download them."""
11840

0 commit comments

Comments
 (0)