Skip to content

Commit 85ecdab

Browse files
authored
Add text as html to orig elements chunks (#3779)
This is the simplest solution: HTML is no longer dropped from the metadata when merging Elements from HTML input. We still need to decide how to handle nested elements and whether we want `LayoutElements` in the metadata of Composite Elements; a unit test is included showing the current behavior. Note: the metadata still contains `orig_elements`, which holds all of the original metadata.
1 parent e1babf0 commit 85ecdab

File tree

4 files changed

+97
-1
lines changed

4 files changed

+97
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
### Features
88

99
### Fixes
10+
- **ElementMetadata consolidation** `text_as_html` metadata is now combined across all elements in a `CompositeElement` when chunking HTML output.
1011

1112
## 0.16.5
1213

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from functools import partial
2+
3+
import pytest
4+
5+
from unstructured.chunking.basic import chunk_elements
6+
from unstructured.chunking.title import chunk_by_title
7+
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title
8+
9+
10+
@pytest.fixture(
    params=[
        chunk_elements,
        partial(chunk_by_title, combine_text_under_n_chars=0),
    ]
)
def chunking_fn(request):
    """Provide each chunking callable under test, one per parametrized run.

    The parameters are `chunk_elements` and `chunk_by_title` with
    `combine_text_under_n_chars=0` pre-bound via `functools.partial`.
    """
    chunker = request.param
    return chunker
13+
14+
15+
def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn):
    """A single chunk's `text_as_html` is the space-joined HTML of all elements merged into it."""
    # -- arrange: three elements, each carrying its own HTML fragment --
    title_html = '<h1 class="Title" id="1">Header </h1>'
    date_html = '<time class="CalendarDate" id="2">Date: October 30, 2023 </time>'
    form_html = (
        '<form class="Form" id="3"> '
        '<label class="FormField" for="company-name" id="4">Form field name </label>'
        '<input class="FormFieldValue" id="5" value="Example value" />'
        "</form>"
    )
    title = Title(text="Header", metadata=ElementMetadata(text_as_html=title_html))
    date = Text(text="Date: October 30, 2023", metadata=ElementMetadata(text_as_html=date_html))
    form = Text(
        text="Form field name Example value",
        metadata=ElementMetadata(text_as_html=form_html),
    )
    expected_html = f"{title_html} {date_html} {form_html}"

    # -- act --
    chunks = chunking_fn([title, date, form])

    # -- assert: everything fits in one chunk whose HTML is the concatenation --
    assert len(chunks) == 1
    assert chunks[0].metadata.text_as_html == expected_html
36+
37+
38+
def test_combining_html_metadata_with_nested_relationship_between_elements(chunking_fn):
    """Show how HTML metadata is distributed when nested elements are split across chunks.

    Ground truth
        <Document>
          <Page>
            <Section>
              <p>First</p>
              <p>Second</p>
            </Section>
          </Page>
        </Document>

    Only the Section and its two paragraphs are modelled by this test.

    Chunk 1: Section, Paragraph ("First")
    Chunk 2: Paragraph ("Second")
    """
    section_html = '<div class="Section" id="1" />'
    first_html = '<p class="Paragraph" id="2">First </p>'
    second_html = '<p class="Paragraph" id="3">Second </p>'

    # -- the section has no text of its own; both paragraphs name it as their parent --
    section = Text(text="", metadata=ElementMetadata(text_as_html=section_html))
    first = NarrativeText(
        text="First",
        metadata=ElementMetadata(text_as_html=first_html, parent_id="1"),
    )
    second = NarrativeText(
        text="Second",
        metadata=ElementMetadata(text_as_html=second_html, parent_id="1"),
    )

    chunks = chunking_fn([section, first, second], max_characters=6)

    assert len(chunks) == 2
    first_chunk, second_chunk = chunks[0], chunks[1]
    assert first_chunk.text == "First"
    assert second_chunk.text == "Second"

    # -- the section's HTML travels with the first chunk only --
    assert first_chunk.metadata.text_as_html == f"{section_html} {first_html}"
    assert second_chunk.metadata.text_as_html == second_html
76+
77+
78+
def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn):
    """Mimic behaviour of elements with non-html metadata.

    When one element's text is split over several chunks, each chunk carries the
    element's full `text_as_html` value.
    """
    header_html = '<h1 class="Title" id="1">Header </h1>'
    header = Title(text="Header", metadata=ElementMetadata(text_as_html=header_html))

    chunks = chunking_fn([header], max_characters=3)

    assert len(chunks) == 2
    assert chunks[0].text == "Hea"
    assert chunks[1].text == "der"

    # -- both halves of the split text keep the complete, unsplit HTML --
    for chunk in (chunks[0], chunks[1]):
        assert chunk.metadata.text_as_html == header_html

unstructured/chunking/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -774,6 +774,8 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]:
774774
# -- Python 3.7+ maintains dict insertion order --
775775
ordered_unique_keys = {key: None for val_list in values for key in val_list}
776776
yield field_name, list(ordered_unique_keys.keys())
777+
elif strategy is CS.STRING_CONCATENATE:
778+
yield field_name, " ".join(val.strip() for val in values)
777779
elif strategy is CS.DROP:
778780
continue
779781
else: # pragma: no cover

unstructured/documents/elements.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,9 @@ class ConsolidationStrategy(enum.Enum):
458458
FIRST = "first"
459459
"""Use the first value encountered, omit if not present in any elements."""
460460

461+
STRING_CONCATENATE = "string_concatenate"
462+
"""Combine the values of this field across elements. Only suitable for fields of `str` type."""
463+
461464
LIST_CONCATENATE = "LIST_CONCATENATE"
462465
"""Concatenate the list values across elements. Only suitable for fields of `List` type."""
463466

@@ -507,7 +510,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
507510
"sent_to": cls.FIRST,
508511
"signature": cls.FIRST,
509512
"subject": cls.FIRST,
510-
"text_as_html": cls.FIRST, # -- only occurs in Table --
513+
"text_as_html": cls.STRING_CONCATENATE,
511514
"table_as_cells": cls.FIRST, # -- only occurs in Table --
512515
"url": cls.FIRST,
513516
"key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --

0 commit comments

Comments
 (0)