Skip to content

Commit c53ce11

Browse files
authored
fix: enable partition_html to grab content outside of <article> tags (#772)
* optionally dont assemble articles * add test for content outside of articles * pass kwargs in partition * changelog and version * update default to False * bump version for release * back to dev version to get another fix in the release
1 parent feaf1cb commit c53ce11

File tree

7 files changed

+92
-18
lines changed

7 files changed

+92
-18
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212

1313
### Fixes
1414

15+
* Adds an `html_assemble_articles` kwarg to `partition_html` to enable users to
16+
control whether content outside of `<article>` tags is captured when
17+
`<article>` tags are present.
1518
* Check for the `xml` attribute on `element` before looking for pagebreaks in `partition_docx`.
1619

1720
## 0.7.6

docs/source/bricks.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,15 @@ to disable SSL verification in the request.
329329
elements = partition_html(url="https://python.org/", ssl_verify=False)
330330
331331
332+
333+
If your website contains news articles, it can be helpful to only grab content that appears in
334+
between the ``<article>`` tags, if the site uses that convention.
335+
To activate this behavior, you can set ``html_assemble_articles=True``.
336+
If ``html_assemble_articles`` is ``True``, each ``<article>`` tag will be treated as a page.
337+
If ``html_assemble_articles`` is ``True`` and no ``<article>`` tags are present, the behavior
338+
is the same as ``html_assemble_articles=False``.
339+
340+
332341
``partition_image``
333342
---------------------
334343

test_unstructured/partition/test_html_partition.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,3 +246,20 @@ def test_emoji_appears_with_emoji_utf8_code():
246246
html_text = """\n<html charset="utf-8"><p>Hello &#128512;</p></html>"""
247247
elements = partition_html(text=html_text)
248248
assert elements[0] == Title("Hello 😀")
249+
250+
251+
def test_partition_html_can_turn_off_assemble_articles():
252+
html_text = """<html>
253+
<article>
254+
<h1>Some important stuff is going on!</h1>
255+
<p>Here is a description of that stuff</p>
256+
</article>
257+
<article>
258+
<h1>Some other important stuff is going on!</h1>
259+
<p>Here is a description of that stuff</p>
260+
</article>
261+
<h4>This is outside of the article.</h4>
262+
</html>
263+
"""
264+
elements = partition_html(text=html_text, html_assemble_articles=False)
265+
assert elements[-1] == Title("This is outside of the article.")

unstructured/documents/html.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
Text,
2121
Title,
2222
)
23-
from unstructured.documents.xml import XMLDocument
23+
from unstructured.documents.xml import VALID_PARSERS, XMLDocument
2424
from unstructured.logger import logger
2525
from unstructured.partition.text_type import (
2626
is_bulleted_text,
@@ -90,6 +90,15 @@ class HTMLDocument(XMLDocument):
9090
"""Class for handling HTML documents. Uses rules based parsing to identify sections
9191
of interest within the document."""
9292

93+
def __init__(
94+
self,
95+
stylesheet: Optional[str] = None,
96+
parser: VALID_PARSERS = None,
97+
assemble_articles: bool = True,
98+
):
99+
self.assembled_articles = assemble_articles
100+
super().__init__(stylesheet=stylesheet, parser=parser)
101+
93102
def _read(self) -> List[Page]:
94103
"""Reads and structures and HTML document. If present, looks for article tags.
95104
if there are multiple article sections present, a page break is inserted between them.
@@ -101,7 +110,7 @@ def _read(self) -> List[Page]:
101110
etree.strip_elements(self.document_tree, ["script"])
102111
root = _find_main(self.document_tree)
103112

104-
articles = _find_articles(root)
113+
articles = _find_articles(root, assemble_articles=self.assembled_articles)
105114
page_number = 0
106115
page = Page(number=page_number)
107116
for article in articles:
@@ -407,9 +416,12 @@ def _find_main(root: etree.Element) -> etree.Element:
407416
return main_tag_elem if main_tag_elem is not None else root
408417

409418

410-
def _find_articles(root: etree.Element) -> List[etree.Element]:
419+
def _find_articles(root: etree.Element, assemble_articles: bool = True) -> List[etree.Element]:
411420
"""Tries to break the HTML document into distinct articles. If there are no article
412421
tags, the entire document is returned as a single item list."""
422+
if assemble_articles is False:
423+
return root
424+
413425
articles = root.findall(".//article")
414426
if len(articles) == 0:
415427
# NOTE(robinson) - ref: https://schema.org/Article

unstructured/documents/xml.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,16 @@ def _read_xml(self, content):
9292
return self.document_tree
9393

9494
@classmethod
95-
def from_string(cls, text: str, parser: VALID_PARSERS = None, stylesheet: Optional[str] = None):
95+
def from_string(
96+
cls,
97+
text: str,
98+
parser: VALID_PARSERS = None,
99+
stylesheet: Optional[str] = None,
100+
**kwargs,
101+
):
96102
"""Supports reading in an XML file as a raw string rather than as a file."""
97103
logger.info("Reading document from string ...")
98-
doc = cls(parser=parser, stylesheet=stylesheet)
104+
doc = cls(parser=parser, stylesheet=stylesheet, **kwargs)
99105
doc._read_xml(text)
100106
return doc
101107

@@ -106,6 +112,7 @@ def from_file(
106112
parser: VALID_PARSERS = None,
107113
stylesheet: Optional[str] = None,
108114
encoding: Optional[str] = None,
115+
**kwargs,
109116
):
110117
_, content = read_txt_file(filename=filename, encoding=encoding)
111-
return cls.from_string(content, parser=parser, stylesheet=stylesheet)
118+
return cls.from_string(content, parser=parser, stylesheet=stylesheet, **kwargs)

unstructured/partition/auto.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def partition(
5050
pdf_infer_table_structure: bool = False,
5151
xml_keep_tags: bool = False,
5252
data_source_metadata: Optional[DataSourceMetadata] = None,
53+
**kwargs,
5354
):
5455
"""Partitions a document into its constituent elements. Will use libmagic to determine
5556
the file's type and route it to the appropriate partitioning function. Applies the default
@@ -121,46 +122,51 @@ def partition(
121122
file.seek(0)
122123

123124
if filetype == FileType.DOC:
124-
elements = partition_doc(filename=filename, file=file)
125+
elements = partition_doc(filename=filename, file=file, **kwargs)
125126
elif filetype == FileType.DOCX:
126-
elements = partition_docx(filename=filename, file=file)
127+
elements = partition_docx(filename=filename, file=file, **kwargs)
127128
elif filetype == FileType.ODT:
128-
elements = partition_odt(filename=filename, file=file)
129+
elements = partition_odt(filename=filename, file=file, **kwargs)
129130
elif filetype == FileType.EML:
130-
elements = partition_email(filename=filename, file=file, encoding=encoding)
131+
elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
131132
elif filetype == FileType.MSG:
132-
elements = partition_msg(filename=filename, file=file)
133+
elements = partition_msg(filename=filename, file=file, **kwargs)
133134
elif filetype == FileType.HTML:
134135
elements = partition_html(
135136
filename=filename,
136137
file=file,
137138
include_page_breaks=include_page_breaks,
138139
encoding=encoding,
140+
**kwargs,
139141
)
140142
elif filetype == FileType.XML:
141143
elements = partition_xml(
142144
filename=filename,
143145
file=file,
144146
encoding=encoding,
145147
xml_keep_tags=xml_keep_tags,
148+
**kwargs,
146149
)
147150
elif filetype == FileType.EPUB:
148151
elements = partition_epub(
149152
filename=filename,
150153
file=file,
151154
include_page_breaks=include_page_breaks,
155+
**kwargs,
152156
)
153157
elif filetype == FileType.RST:
154158
elements = partition_rst(
155159
filename=filename,
156160
file=file,
157161
include_page_breaks=include_page_breaks,
162+
**kwargs,
158163
)
159164
elif filetype == FileType.MD:
160165
elements = partition_md(
161166
filename=filename,
162167
file=file,
163168
include_page_breaks=include_page_breaks,
169+
**kwargs,
164170
)
165171
elif filetype == FileType.PDF:
166172
elements = partition_pdf(
@@ -171,6 +177,7 @@ def partition(
171177
infer_table_structure=pdf_infer_table_structure,
172178
strategy=strategy,
173179
ocr_languages=ocr_languages,
180+
**kwargs,
174181
)
175182
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
176183
elements = partition_image(
@@ -180,40 +187,45 @@ def partition(
180187
include_page_breaks=include_page_breaks,
181188
strategy=strategy,
182189
ocr_languages=ocr_languages,
190+
**kwargs,
183191
)
184192
elif filetype == FileType.TXT:
185193
elements = partition_text(
186194
filename=filename,
187195
file=file,
188196
encoding=encoding,
189197
paragraph_grouper=paragraph_grouper,
198+
**kwargs,
190199
)
191200
elif filetype == FileType.RTF:
192201
elements = partition_rtf(
193202
filename=filename,
194203
file=file,
195204
include_page_breaks=include_page_breaks,
205+
**kwargs,
196206
)
197207
elif filetype == FileType.PPT:
198208
elements = partition_ppt(
199209
filename=filename,
200210
file=file,
201211
include_page_breaks=include_page_breaks,
212+
**kwargs,
202213
)
203214
elif filetype == FileType.PPTX:
204215
elements = partition_pptx(
205216
filename=filename,
206217
file=file,
207218
include_page_breaks=include_page_breaks,
219+
**kwargs,
208220
)
209221
elif filetype == FileType.JSON:
210-
elements = partition_json(filename=filename, file=file)
222+
elements = partition_json(filename=filename, file=file, **kwargs)
211223
elif (filetype == FileType.XLSX) or (filetype == FileType.XLS):
212-
elements = partition_xlsx(filename=filename, file=file)
224+
elements = partition_xlsx(filename=filename, file=file, **kwargs)
213225
elif filetype == FileType.CSV:
214-
elements = partition_csv(filename=filename, file=file)
226+
elements = partition_csv(filename=filename, file=file, **kwargs)
215227
elif filetype == FileType.TSV:
216-
elements = partition_tsv(filename=filename, file=file)
228+
elements = partition_tsv(filename=filename, file=file, **kwargs)
217229
elif filetype == FileType.EMPTY:
218230
elements = []
219231
else:

unstructured/partition/html.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def partition_html(
3030
headers: Dict[str, str] = {},
3131
ssl_verify: bool = True,
3232
parser: VALID_PARSERS = None,
33+
html_assemble_articles: bool = False,
3334
**kwargs,
3435
) -> List[Element]:
3536
"""Partitions an HTML document into its constituent elements.
@@ -66,15 +67,28 @@ def partition_html(
6667
exactly_one(filename=filename, file=file, text=text, url=url)
6768

6869
if filename is not None:
69-
document = HTMLDocument.from_file(filename, parser=parser, encoding=encoding)
70+
document = HTMLDocument.from_file(
71+
filename,
72+
parser=parser,
73+
encoding=encoding,
74+
assemble_articles=html_assemble_articles,
75+
)
7076

7177
elif file is not None:
7278
_, file_text = read_txt_file(file=file, encoding=encoding)
73-
document = HTMLDocument.from_string(file_text, parser=parser)
79+
document = HTMLDocument.from_string(
80+
file_text,
81+
parser=parser,
82+
assemble_articles=html_assemble_articles,
83+
)
7484

7585
elif text is not None:
7686
_text: str = str(text)
77-
document = HTMLDocument.from_string(_text, parser=parser)
87+
document = HTMLDocument.from_string(
88+
_text,
89+
parser=parser,
90+
assemble_articles=html_assemble_articles,
91+
)
7892

7993
elif url is not None:
8094
response = requests.get(url, headers=headers, verify=ssl_verify)

0 commit comments

Comments
 (0)