Skip to content

Commit 4140f62

Browse files
add script to render html from unstructured elements (#3799)
Script to render HTML from unstructured elements. NOTE: This script is not intended to be used as a module. NOTE: This script is only intended to be used with outputs that have a non-empty `metadata.text_as_html`. TODO: It was noted that the `unstructured_elements_to_ontology` function always returns a single page, so this script uses helper functions to handle multiple pages. I am not sure whether this was intended or is a bug; if it is a bug, it would require somewhat longer debugging, so I used workarounds to make the script usable quickly. Usage: test with any outputs that have a non-empty `metadata.text_as_html`. Example files are attached: [Example-Bill-of-Lading-Waste.docx.pdf.json](https://github.com/user-attachments/files/17922898/Example-Bill-of-Lading-Waste.docx.pdf.json) [Breast_Cancer1-5.pdf.json](https://github.com/user-attachments/files/17922899/Breast_Cancer1-5.pdf.json)
1 parent 0fb814d commit 4140f62

File tree

1 file changed

+146
-0
lines changed

1 file changed

+146
-0
lines changed
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# pyright: reportPrivateUsage=false
2+
3+
"""
4+
Script to render HTML from unstructured elements.
5+
NOTE: This script is not intended to be used as a module.
6+
NOTE: For now, this script is only intended to be used with elements generated with
7+
`partition_html(html_parser_version=v2)`
8+
TODO: It was noted that the `unstructured_elements_to_ontology` function always returns a single page.
9+
This script is using helper functions to handle multiple pages.
10+
"""
11+
12+
import argparse
13+
import logging
14+
import os
15+
import select
16+
import sys
17+
from collections import defaultdict
18+
from typing import List, Sequence
19+
20+
from bs4 import BeautifulSoup
21+
22+
from unstructured.documents import elements
23+
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
24+
from unstructured.staging.base import elements_from_json
25+
26+
# Logging is configured at import time because this file is run as a script,
# not imported as a library module.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
29+
30+
31+
def extract_document_div(html_content: str) -> str:
    """Return the leading part of ``html_content`` up to its first ``>``.

    The result includes the ``>`` itself, so for markup that starts with an
    element it is that element's opening tag.

    Args:
        html_content: HTML markup to scan.

    Returns:
        The prefix of ``html_content`` ending at (and including) the first
        ``>`` character.

    Raises:
        ValueError: If ``html_content`` contains no ``>`` character.
    """
    prefix, bracket, _ = html_content.partition(">")
    if not bracket:
        # partition() yields an empty separator when '>' is absent.
        logger.error("No '>' found in the HTML content.")
        raise ValueError("No '>' found in the HTML content.")
    return prefix + bracket
37+
38+
39+
def extract_page_div(html_content: str) -> str:
    """Return the only ``<div>`` with class ``Page`` found in ``html_content``.

    Args:
        html_content: HTML markup expected to contain exactly one page div.

    Returns:
        The matching ``<div>`` element serialized back to a string.

    Raises:
        ValueError: If the markup contains zero or more than one ``<div>``
            element with class ``Page``.
    """
    matches = BeautifulSoup(html_content, "html.parser").find_all("div", class_="Page")
    if len(matches) == 1:
        return str(matches[0])

    logger.error("Expected exactly one <div> element with class 'Page'. Found %d.", len(matches))
    raise ValueError("Expected exactly one <div> element with class 'Page'.")
48+
49+
50+
def fold_document_div(
    html_document_start: str, html_document_end: str, html_per_page: List[str]
) -> str:
    """Wrap per-page HTML fragments in a single document element.

    Args:
        html_document_start: Markup placed before the pages (the opening tag
            of the document wrapper).
        html_document_end: Markup placed after the pages (the closing tag of
            the document wrapper).
        html_per_page: HTML fragments, one per page, in output order.

    Returns:
        The concatenation of the start markup, every page fragment in order,
        and the end markup.
    """
    # One join instead of repeated ``+=`` avoids building an intermediate
    # string for every page.
    return "".join([html_document_start, *html_per_page, html_document_end])
58+
59+
60+
def group_elements_by_page(
    unstructured_elements: Sequence[elements.Element],
) -> Sequence[Sequence[elements.Element]]:
    """Split elements into groups that share the same ``metadata.page_number``.

    Args:
        unstructured_elements: Elements to group; each one is read through
            ``element.metadata.page_number``.

    Returns:
        A list of element lists, one per distinct page number. Groups appear
        in the order their page number is first encountered, and elements
        keep their input order within each group.
    """
    by_page = {}
    for item in unstructured_elements:
        # Whatever value page_number holds is used directly as the grouping key.
        by_page.setdefault(item.metadata.page_number, []).append(item)
    return list(by_page.values())
71+
72+
73+
def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str:
    """Render an HTML document from serialized unstructured elements.

    Exactly one of ``filepath`` or ``text`` must be provided.

    Args:
        filepath: Path to a JSON file with unstructured elements.
        text: JSON string with unstructured elements.

    Returns:
        str: HTML content made of the opening tag taken from the first page's
        rendering, the ``Page`` div of every page in order, and a closing
        ``</div>``.

    Raises:
        ValueError: If neither or both of ``filepath`` and ``text`` are given,
            if the input contains no elements, or if a page's rendering does
            not have the expected structure.
    """
    if filepath is None and text is None:
        logger.error("Either filepath or text must be provided.")
        raise ValueError("Either filepath or text must be provided.")
    if filepath is not None and text is not None:
        logger.error("Both filepath and text cannot be provided.")
        raise ValueError("Both filepath and text cannot be provided.")
    if filepath is not None:
        logger.info("Rendering HTML from file: %s", filepath)
    else:
        logger.info("Rendering HTML from text.")

    unstructured_elements = elements_from_json(filename=filepath, text=text)
    if not unstructured_elements:
        # Without this guard the empty input surfaces as a bare IndexError below.
        logger.error("No elements found in the input.")
        raise ValueError("No elements found in the input.")

    # unstructured_elements_to_ontology was observed to return a single page,
    # so it is called once per page group and the pages are stitched together.
    html_per_page = [
        unstructured_elements_to_ontology(page_elements).to_html()
        for page_elements in group_elements_by_page(unstructured_elements)
    ]

    html_document_start = extract_document_div(html_per_page[0])
    html_document_end = "</div>"
    page_divs = [extract_page_div(page_html) for page_html in html_per_page]

    return fold_document_div(html_document_start, html_document_end, page_divs)
106+
107+
108+
def _main() -> None:
    """Run the script from the command line.

    When the environment variable ``PROCESS_FROM_STDIN`` equals ``"true"``,
    the JSON is read from standard input and the HTML is written to standard
    output; the process exits with status 1 if no input is ready on stdin
    within 0.1 seconds.

    Otherwise the JSON file path is taken from the command line and the HTML
    is written to ``<outdir>/<input name with its extension replaced by
    .html>``. ``--outdir`` defaults to the directory of the input file.
    """
    if os.getenv("PROCESS_FROM_STDIN") == "true":
        logger.info("Processing from STDIN (PROCESS_FROM_STDIN is set to 'true')")
        # Poll briefly instead of blocking forever when nothing is piped in.
        if select.select([sys.stdin], [], [], 0.1)[0]:
            content = sys.stdin.read()
            html = rendered_html(text=content)
            sys.stdout.write(html)
        else:
            logger.error("No input provided via STDIN. Exiting.")
            sys.exit(1)
    else:
        logger.info("Processing from command line arguments")
        parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.")
        parser.add_argument(
            "filepath", help="Path to JSON file with unstructured elements.", type=str
        )
        parser.add_argument(
            "--outdir",
            help="Path to directory where the rendered html will be stored.",
            type=str,
            default=None,
            nargs="?",
        )
        args = parser.parse_args()

        html = rendered_html(filepath=args.filepath)

        # dirname() is "" for a bare filename and os.makedirs("") raises, so
        # fall back to the current directory.
        outdir = args.outdir or os.path.dirname(args.filepath) or "."
        os.makedirs(outdir, exist_ok=True)

        # Swap only the final extension. A substring replace of ".json" would
        # leave other names unchanged and could overwrite the input file.
        stem, _ = os.path.splitext(os.path.basename(args.filepath))
        outpath = os.path.join(outdir, stem + ".html")
        with open(outpath, "w", encoding="utf-8") as f:
            f.write(html)
        logger.info("HTML rendered and saved to: %s", outpath)


if __name__ == "__main__":
    _main()

0 commit comments

Comments
 (0)