Skip to content

Commit eb1b294

Browse files
authored
ML-405/ML-427 - OntologyElement improvements (#3758)
- The `value` attribute of the `<input/>` tag is now taken into account and processed as "text" in the ontology.
- Table subtags are now parsed without any ids and classes. There are several reasons for this: for example, embeddings of HTML that contains ids and classes can lose some semantic value, and more tokens mean a more expensive LLM call.
- Cleaned up `to_html` and added `to_text` for `OntologyElement`.
1 parent d0be115 commit eb1b294

File tree

7 files changed

+98
-56
lines changed

7 files changed

+98
-56
lines changed

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1-
## 0.16.4-dev0
1+
## 0.16.4-dev1
22

33
### Enhancements
44

5+
* **`value` attribute of the `<input/>` element is included in the text returned by `OntologyElement.to_text()`**
6+
* **`id` and `class` attributes removed from Table subtags in HTML partitioning**
7+
* **Cleaned up `to_html` and introduced a new `to_text` method in `OntologyElement`**
8+
59
### Features
610

711
### Fixes

test_unstructured/documents/unstructured_json_output/example.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
"parent_id": "3a6b156a81764e17be128264241f8136",
5757
"text_as_html": "<form class=\"Form\" id=\"637c2f6935fb4353a5f73025ce04619d\"> <label class=\"FormField\" for=\"company-name\" id=\"50027cccbe1948c9853ce0de37b635c2\">From field name </label><input class=\"FormFieldValue\" id=\"0032242af75c4b37984ea7fea9aac74c\" value=\"Example value\" /></form>"
5858
},
59-
"text": "From field name",
59+
"text": "From field name Example value",
6060
"type": "UncategorizedText"
6161
},
6262
{
@@ -78,9 +78,9 @@
7878
"filename": "example.pdf",
7979
"page_number": 1,
8080
"parent_id": "592422373ed741b68a077e2003f8ed81",
81-
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead class=\"TableHeader\" id=\"50a5548a87e84024af590b3d2830d140\"> <tr class=\"TableRow\" id=\"5e473d7742474412be72dc4e2c45bd4a\"> <th class=\"TableCellHeader\" id=\"01800309aa42411c98ae30f85b23f399\">Description </th><th class=\"TableCellHeader\" id=\"c2765b63d08946a2851955e79e301de4\">Row header </th></tr></thead><tbody class=\"TableBody\" id=\"e0a9a8ffdd7148ad8b4a274b073d340a\"> <tr class=\"TableRow\" id=\"77e829974632455191330b0b8545d1e3\"> <td class=\"TableCell\" id=\"7fee12d4c5554b7da778d6f8fdec8a57\">Value description </td><td class=\"TableCell\" id=\"5a7a33b0c57b4eb881a35bce9f87c831\"> <span class=\"Currency\" id=\"87220f9d62c3482e92e7de72a26869cd\">50 $ </span><span class=\"Measurement\" id=\"0095b9efb90a4cca991e73547c7165f1\">(1.32 %) </span></td></tr></tbody></table>"
81+
"text_as_html": "<table class=\"Table\" id=\"dc3792d4422e444f90876b56d0cfb20d\"> <thead> <tr> <th>Description</th><th>Row header</th></tr></thead><tbody> <tr> <td>Value description</td><td>50 $ (1.32 %)</td></tr></tbody></table>"
8282
},
83-
"text": "Description Row header Value description 50 $ (1.32 %)",
83+
"text": "Description Row header Value description 50 $ (1.32 %)",
8484
"type": "Table"
8585
},
8686
{

test_unstructured/partition/html/test_html_to_ontology_parsing.py

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -356,12 +356,12 @@ def test_broken_cell_is_not_raising_error():
356356
"""
357357
<div class="Page">
358358
<table class="Table">
359-
<tbody class="TableBody">
360-
<tr class="TableRow">
361-
<td class="TableCell" tablecell&quot;="">
359+
<tbody>
360+
<tr>
361+
<td tablecell&quot;="">
362362
83.64 GiB
363363
</td>
364-
<th class="TableCellHeader" rowspan="2">
364+
<th rowspan="2">
365365
Fair Value
366366
</th>
367367
</tr>
@@ -406,12 +406,12 @@ def test_table():
406406
"""
407407
<div class="Page">
408408
<table class="Table">
409-
<tbody class="TableBody">
410-
<tr class="TableRow">
411-
<td class="TableCell">
409+
<tbody>
410+
<tr>
411+
<td>
412412
Fair Value1
413413
</td>
414-
<th class="TableCellHeader" rowspan="2">
414+
<th rowspan="2">
415415
Fair Value2
416416
</th>
417417
</tr>
@@ -467,24 +467,20 @@ def test_table_and_time():
467467
"""
468468
<div class="Page">
469469
<table class="Table">
470-
<thead class='TableHeader'>
471-
<tr class="TableRow">
472-
<th class="TableCellHeader" colspan="6">
470+
<thead>
471+
<tr>
472+
<th colspan="6">
473473
Carrying Value
474474
</th>
475475
</tr>
476476
</thead>
477-
<tbody class='TableBody'>
478-
<tr class="TableRow">
479-
<td class="TableCell" colspan="5">
480-
<time class="CalendarDate">
477+
<tbody>
478+
<tr>
479+
<td colspan="5">
481480
June 30, 2023
482-
</time>
483481
</td>
484-
<td class="TableCell">
485-
<span class="Currency">
482+
<td>
486483
$—
487-
</span>
488484
</td>
489485
</tr>
490486
</tbody>
@@ -594,3 +590,18 @@ def test_text_is_wrapped_inside_layout_element():
594590
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
595591

596592
assert parsed_ontology == expected_html
593+
594+
595+
def test_text_in_form_field_value():
596+
# language=HTML
597+
input_html = """
598+
<div class="Page">
599+
<input class="FormFieldValue" value="Random Input Value"/>
600+
</div>
601+
"""
602+
page = parse_html_to_ontology(input_html)
603+
604+
assert len(page.children) == 1
605+
form_field_value = page.children[0]
606+
assert form_field_value.text == ""
607+
assert form_field_value.to_text() == "Random Input Value"

test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -314,24 +314,21 @@ def test_table():
314314
unstructured_elements, parsed_ontology = _parse_to_unstructured_elements_and_back_to_html(
315315
html_as_str
316316
)
317-
expected_html = indent_html(html_as_str, html_parser="html.parser")
318-
parsed_html = indent_html(parsed_ontology.to_html(), html_parser="html.parser")
319317

320-
assert expected_html == parsed_html
321318
expected_elements = _page_elements + [
322319
Table(
323320
text="Fair Value1 Fair Value2",
324321
detection_origin="vlm_partitioner",
325322
element_id="2",
326323
metadata=ElementMetadata(
327324
text_as_html='<table class="Table" id="2"> '
328-
'<tbody class="TableBody" id="3"> '
329-
'<tr class="TableRow" id="4"> '
330-
'<td class="TableCell" id="5">'
331-
"Fair Value1 "
325+
"<tbody> "
326+
"<tr> "
327+
"<td>"
328+
"Fair Value1"
332329
"</td>"
333-
'<th class="TableCellHeader" rowspan="2" id="6">'
334-
"Fair Value2 "
330+
'<th rowspan="2">'
331+
"Fair Value2"
335332
"</th></tr></tbody></table>",
336333
parent_id="1",
337334
),

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.4-dev0" # pragma: no cover
1+
__version__ = "0.16.4-dev1" # pragma: no cover

unstructured/documents/ontology.py

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from enum import Enum
2121
from typing import List, Optional
2222

23+
from bs4 import BeautifulSoup
2324
from pydantic import BaseModel, Field
2425

2526

@@ -75,32 +76,39 @@ def generate_unique_id() -> str:
7576

7677
def to_html(self, add_children=True) -> str:
7778
additional_attrs = copy(self.additional_attributes)
78-
if "class" in additional_attrs:
79-
del additional_attrs["class"]
80-
81-
# TODO(Pluto) Add support for multiple classes
82-
attrs = " ".join(
83-
f'{key}="{value}"' if value else f"{key}" for key, value in additional_attrs.items()
84-
)
79+
additional_attrs.pop("class", None)
8580

81+
attr_str = self._construct_attribute_string(additional_attrs)
8682
class_attr = f'class="{self.css_class_name}"' if self.css_class_name else ""
87-
attr_str = f"{class_attr} {attrs}".strip()
8883

89-
children_html = (
90-
("" if not self.children else "".join(child.to_html() for child in self.children))
91-
if add_children
92-
else ""
84+
combined_attr_str = f"{class_attr} {attr_str}".strip()
85+
86+
children_html = self._generate_children_html(add_children)
87+
88+
result_html = self._generate_final_html(combined_attr_str, children_html)
89+
90+
return result_html
91+
92+
def to_text(self, add_children=True) -> str:
93+
return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings)
94+
95+
def _construct_attribute_string(self, attributes: dict) -> str:
96+
return " ".join(
97+
f'{key}="{value}"' if value else f"{key}" for key, value in attributes.items()
9398
)
94-
text = "" if not self.text else self.text
99+
100+
def _generate_children_html(self, add_children: bool) -> str:
101+
if not add_children or not self.children:
102+
return ""
103+
return "".join(child.to_html() for child in self.children)
104+
105+
def _generate_final_html(self, attr_str: str, children_html: str) -> str:
106+
text = self.text or ""
95107

96108
if text or children_html:
97-
# This is either one or another, never both
98-
result_html = (
99-
f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
100-
)
109+
return f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>"
101110
else:
102-
result_html = f"<{self.html_tag_name} {attr_str} />"
103-
return result_html
111+
return f"<{self.html_tag_name} {attr_str} />"
104112

105113
@property
106114
def id(self) -> str | None:
@@ -254,6 +262,18 @@ class Table(OntologyElement):
254262
elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True)
255263
allowed_tags: List[str] = Field(["table"], frozen=True)
256264

265+
def to_html(self, add_children=True) -> str:
266+
soup = BeautifulSoup(super().to_html(add_children), "html.parser")
267+
268+
for tag in soup.find_all(True):
269+
if tag.name != "table":
270+
tag.attrs.pop("class", None)
271+
tag.attrs.pop("id", None)
272+
if tag.name in ["td", "th"]:
273+
tag.string = " ".join(tag.stripped_strings)
274+
275+
return str(soup)
276+
257277

258278
class TableBody(OntologyElement):
259279
description: str = Field("A body of the table", frozen=True)
@@ -430,6 +450,15 @@ class Form(OntologyElement):
430450
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
431451
allowed_tags: List[str] = Field(["form"], frozen=True)
432452

453+
def to_text(self, add_children=True) -> str:
454+
texts = [self.text] if self.text else []
455+
456+
if add_children:
457+
for child in self.children:
458+
texts.append(child.to_text(add_children=True))
459+
460+
return " ".join(filter(None, texts)).strip()
461+
433462

434463
class FormField(OntologyElement):
435464
description: str = Field("A property value of a form", frozen=True)
@@ -442,6 +471,9 @@ class FormFieldValue(OntologyElement):
442471
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
443472
allowed_tags: List[str] = Field(["input"], frozen=True)
444473

474+
def to_text(self, add_children=True) -> str:
475+
return super().to_text() + self.additional_attributes.get("value", "")
476+
445477

446478
class Checkbox(OntologyElement):
447479
description: str = Field("A small box that can be checked or unchecked", frozen=True)

unstructured/partition/html/transformations.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,8 @@ def ontology_to_unstructured_elements(
9696
]
9797
element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name]
9898
html_code_of_ontology_element = ontology_element.to_html()
99-
element_text = (
100-
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
101-
)
102-
# TODO value attribute from form input should be added to the text
99+
element_text = ontology_element.to_text()
100+
103101
unstructured_element = element_class(
104102
text=element_text,
105103
element_id=ontology_element.id,

0 commit comments

Comments
 (0)