Skip to content

Commit 7d06c12

Browse files
Merge branch 'main' into ML-593/quote-standardization
2 parents a4be1d6 + 4140f62 commit 7d06c12

File tree

5 files changed

+155
-93
lines changed

5 files changed

+155
-93
lines changed

CHANGELOG.md

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,3 @@
1-
## 0.16.10-dev0
2-
3-
### Enhancements
4-
5-
### Features
6-
7-
### Fixes
8-
91
## 0.16.9
102

113
### Enhancements
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# pyright: reportPrivateUsage=false
2+
3+
"""
4+
Script to render HTML from unstructured elements.
5+
NOTE: This script is not intended to be used as a module.
6+
NOTE: For now script is only intended to be used with elements generated with
7+
`partition_html(html_parser_version=v2)`
8+
TODO: It was noted that unstructured_elements_to_ontology func always returns a single page
9+
This script is using helper functions to handle multiple pages.
10+
"""
11+
12+
import argparse
13+
import logging
14+
import os
15+
import select
16+
import sys
17+
from collections import defaultdict
18+
from typing import List, Sequence
19+
20+
from bs4 import BeautifulSoup
21+
22+
from unstructured.documents import elements
23+
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
24+
from unstructured.staging.base import elements_from_json
25+
26+
# Configure module-level logging: timestamped INFO-and-above messages.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Module logger used by all helpers in this script.
logger = logging.getLogger(__name__)
29+
30+
31+
def extract_document_div(html_content: str) -> str:
    """Return the opening tag of the top-level document div.

    Takes everything up to and including the first '>' in *html_content*,
    which for a rendered document is the opening ``<div ...>`` tag.

    Raises:
        ValueError: if *html_content* contains no '>' character.
    """
    tag_end = html_content.find(">")
    if tag_end == -1:
        logger.error("No '>' found in the HTML content.")
        raise ValueError("No '>' found in the HTML content.")
    return html_content[: tag_end + 1]
37+
38+
39+
def extract_page_div(html_content: str) -> str:
    """Return the single ``<div class="Page">`` element from *html_content*.

    Raises:
        ValueError: if the parsed HTML does not contain exactly one
            Page div.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    matches = parsed.find_all("div", class_="Page")
    if len(matches) == 1:
        return str(matches[0])
    logger.error(
        "Expected exactly one <div> element with class 'Page'. Found %d.", len(matches)
    )
    raise ValueError("Expected exactly one <div> element with class 'Page'.")
48+
49+
50+
def fold_document_div(
    html_document_start: str, html_document_end: str, html_per_page: List[str]
) -> str:
    """Assemble the document: opening tag, pages in order, closing tag."""
    return "".join([html_document_start, *html_per_page, html_document_end])
58+
59+
60+
def group_elements_by_page(
    unstructured_elements: Sequence[elements.Element],
) -> Sequence[Sequence[elements.Element]]:
    """Bucket elements by their ``metadata.page_number``, preserving order.

    Returns one inner sequence per distinct page number, in first-seen order.
    """
    by_page = defaultdict(list)
    for el in unstructured_elements:
        by_page[el.metadata.page_number].append(el)
    return list(by_page.values())
71+
72+
73+
def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str:
    """Render an HTML document from unstructured elements.

    Exactly one of *filepath* or *text* must be provided.

    Args:
        filepath: path to a JSON file with unstructured elements.
        text: JSON string with unstructured elements.

    Returns:
        str: the rendered HTML document.

    Raises:
        ValueError: if neither or both of *filepath* and *text* are given,
            or if the elements yield no pages to render.
    """
    if filepath is None and text is None:
        logger.error("Either filepath or text must be provided.")
        raise ValueError("Either filepath or text must be provided.")
    if filepath is not None and text is not None:
        logger.error("Both filepath and text cannot be provided.")
        raise ValueError("Both filepath and text cannot be provided.")
    if filepath is not None:
        logger.info("Rendering HTML from file: %s", filepath)
    else:
        logger.info("Rendering HTML from text.")

    unstructured_elements = elements_from_json(filename=filepath, text=text)
    unstructured_elements_per_page = group_elements_by_page(unstructured_elements)
    # unstructured_elements_to_ontology always returns a single page (see the
    # module-level TODO), so it is applied once per page group.
    parsed_ontology_per_page = [
        unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page
    ]
    html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page]
    if not html_per_page:
        raise ValueError("No pages were produced from the provided elements.")

    # The document <div> wraps all pages: reuse the opening tag from the
    # first rendered page and close it once after the last page.
    html_document_start = extract_document_div(html_per_page[0])
    html_document_end = "</div>"
    html_per_page = [extract_page_div(page) for page in html_per_page]

    return fold_document_div(html_document_start, html_document_end, html_per_page)
106+
107+
108+
def _main():
    """CLI entry point.

    When the PROCESS_FROM_STDIN environment variable is "true", elements JSON
    is read from STDIN and the rendered HTML is written to STDOUT. Otherwise
    a filepath is taken from the command line and the HTML is written next to
    it, or into --outdir when given.
    """
    if os.getenv("PROCESS_FROM_STDIN") == "true":
        logger.info("Processing from STDIN (PROCESS_FROM_STDIN is set to 'true')")
        # Poll with a short timeout so we fail fast instead of blocking
        # forever when nothing was piped in.
        # NOTE(review): select() on stdin works for pipes on POSIX only —
        # this branch will not behave correctly on Windows. Confirm intended
        # deployment targets.
        if select.select([sys.stdin], [], [], 0.1)[0]:
            content = sys.stdin.read()
            html = rendered_html(text=content)
            sys.stdout.write(html)
        else:
            logger.error("No input provided via STDIN. Exiting.")
            sys.exit(1)
    else:
        logger.info("Processing from command line arguments")
        parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.")
        parser.add_argument(
            "filepath", help="Path to JSON file with unstructured elements.", type=str
        )
        parser.add_argument(
            "--outdir",
            help="Path to directory where the rendered html will be stored.",
            type=str,
            default=None,
            nargs="?",
        )
        args = parser.parse_args()

        html = rendered_html(filepath=args.filepath)
        if args.outdir is None:
            args.outdir = os.path.dirname(args.filepath)
        os.makedirs(args.outdir, exist_ok=True)
        # Replace only a trailing ".json" extension. str.replace would also
        # rewrite a ".json" occurring in the middle of the file name
        # (e.g. "a.json.bak.json" -> "a.html.bak.json").
        basename = os.path.basename(args.filepath)
        if basename.endswith(".json"):
            basename = basename[: -len(".json")] + ".html"
        outpath = os.path.join(args.outdir, basename)
        with open(outpath, "w") as f:
            f.write(html)
        logger.info("HTML rendered and saved to: %s", outpath)
143+
144+
145+
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    _main()

test_unstructured/nlp/test_tokenize.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99

1010
def test_nltk_packages_download_if_not_present():
11+
tokenize._download_nltk_packages_if_not_present.cache_clear()
1112
with patch.object(nltk, "find", side_effect=LookupError):
1213
with patch.object(tokenize, "download_nltk_packages") as mock_download:
1314
tokenize._download_nltk_packages_if_not_present()
@@ -16,6 +17,7 @@ def test_nltk_packages_download_if_not_present():
1617

1718

1819
def test_nltk_packages_do_not_download_if():
20+
tokenize._download_nltk_packages_if_not_present.cache_clear()
1921
with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
2022
tokenize._download_nltk_packages_if_not_present()
2123

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.9-dev0" # pragma: no cover
1+
__version__ = "0.16.9" # pragma: no cover

unstructured/nlp/tokenize.py

Lines changed: 6 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,6 @@
11
from __future__ import annotations
22

3-
import hashlib
43
import os
5-
import sys
6-
import tarfile
7-
import tempfile
8-
import urllib.request
94
from functools import lru_cache
105
from typing import Final, List, Tuple
116

@@ -16,86 +11,10 @@
1611

1712
CACHE_MAX_SIZE: Final[int] = 128
1813

19-
NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
20-
NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
21-
NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"
22-
23-
24-
# NOTE(robinson) - mimic default dir logic from NLTK
25-
# https://github.com/nltk/nltk/
26-
# blob/8c233dc585b91c7a0c58f96a9d99244a379740d5/nltk/downloader.py#L1046
27-
def get_nltk_data_dir() -> str | None:
28-
"""Locates the directory the nltk data will be saved too. The directory
29-
set by the NLTK environment variable takes highest precedence. Otherwise
30-
the default is determined by the rules indicated below. Returns None when
31-
the directory is not writable.
32-
33-
On Windows, the default download directory is
34-
``PYTHONHOME/lib/nltk``, where *PYTHONHOME* is the
35-
directory containing Python, e.g. ``C:\\Python311``.
36-
37-
On all other platforms, the default directory is the first of
38-
the following which exists or which can be created with write
39-
permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
40-
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
41-
"""
42-
# Check if we are on GAE where we cannot write into filesystem.
43-
if "APPENGINE_RUNTIME" in os.environ:
44-
return
45-
46-
# Check if we have sufficient permissions to install in a
47-
# variety of system-wide locations.
48-
for nltkdir in nltk.data.path:
49-
if os.path.exists(nltkdir) and nltk.internals.is_writable(nltkdir):
50-
return nltkdir
51-
52-
# On Windows, use %APPDATA%
53-
if sys.platform == "win32" and "APPDATA" in os.environ:
54-
homedir = os.environ["APPDATA"]
55-
56-
# Otherwise, install in the user's home directory.
57-
else:
58-
homedir = os.path.expanduser("~/")
59-
if homedir == "~/":
60-
raise ValueError("Could not find a default download directory")
61-
62-
# NOTE(robinson) - NLTK appends nltk_data to the homedir. That's already
63-
# present in the tar file so we don't have to do that here.
64-
return homedir
65-
6614

6715
def download_nltk_packages():
68-
nltk_data_dir = get_nltk_data_dir()
69-
70-
if nltk_data_dir is None:
71-
raise OSError("NLTK data directory does not exist or is not writable.")
72-
73-
# Check if the path ends with "nltk_data" and remove it if it does
74-
if nltk_data_dir.endswith("nltk_data"):
75-
nltk_data_dir = os.path.dirname(nltk_data_dir)
76-
77-
def sha256_checksum(filename: str, block_size: int = 65536):
78-
sha256 = hashlib.sha256()
79-
with open(filename, "rb") as f:
80-
for block in iter(lambda: f.read(block_size), b""):
81-
sha256.update(block)
82-
return sha256.hexdigest()
83-
84-
with tempfile.TemporaryDirectory() as temp_dir_path:
85-
tgz_file_path = os.path.join(temp_dir_path, NLTK_DATA_FILENAME)
86-
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
87-
88-
file_hash = sha256_checksum(tgz_file_path)
89-
if file_hash != NLTK_DATA_SHA256:
90-
os.remove(tgz_file_path)
91-
raise ValueError(f"SHA-256 mismatch: expected {NLTK_DATA_SHA256}, got {file_hash}")
92-
93-
# Extract the contents
94-
if not os.path.exists(nltk_data_dir):
95-
os.makedirs(nltk_data_dir)
96-
97-
with tarfile.open(tgz_file_path, "r:gz") as tar:
98-
tar.extractall(path=nltk_data_dir)
16+
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
17+
nltk.download("punkt_tab", quiet=True)
9918

10019

10120
def check_for_nltk_package(package_name: str, package_category: str) -> bool:
@@ -109,10 +28,13 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool:
10928
try:
11029
nltk.find(f"{package_category}/{package_name}", paths=paths)
11130
return True
112-
except LookupError:
31+
except (LookupError, OSError):
11332
return False
11433

11534

35+
# We cache this because we do not want to attempt
36+
# downloading the packages multiple times
37+
@lru_cache()
11638
def _download_nltk_packages_if_not_present():
11739
"""If required NLTK packages are not available, download them."""
11840

0 commit comments

Comments
 (0)