Skip to content

Commit e1babf0

Browse files
authored
Define default HTML to ontology mapping (#3784)
1 parent ca27b8a commit e1babf0

File tree

9 files changed

+269
-56
lines changed

9 files changed

+269
-56
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
## 0.16.6-dev0
1+
## 0.16.6-dev1
22

33
### Enhancements
44
- **Every <table> tag is considered to be ontology.Table** Added special handling for tables in HTML partitioning. This change is made to improve the accuracy of table extraction from HTML documents.
5+
- **Every HTML tag has a default ontology class assigned** When parsing HTML to ontology, each HTML tag defined in the ontology now has a default ontology class assigned. This makes it possible to assign a specific ontology class instead of UncategorizedText when the HTML tag is predicted correctly but has no class assigned.
56

67
### Features
78

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
<body class="Document" id="517f8559ba594270bdd67e1b02bf19a2">
2+
<table class="Table" id="2428404551304d4db5925f6afee11ed5">
3+
<tr>
4+
<th>
5+
Header 1
6+
</th>
7+
<th>
8+
Header 2
9+
</th>
10+
</tr>
11+
<tr>
12+
<td>
13+
Row 1, Cell 1
14+
</td>
15+
<td>
16+
Row 1, Cell 2
17+
</td>
18+
</tr>
19+
<tr>
20+
<td>
21+
Row 2, Cell 1
22+
</td>
23+
<td>
24+
Row 2, Cell 2
25+
</td>
26+
</tr>
27+
</table>
28+
<table id="9f91cae321c74b31bb1c83ac86cd7afb">
29+
<tr>
30+
<th colspan="3">
31+
Big Table Header
32+
</th>
33+
</tr>
34+
<tr>
35+
<td rowspan="2">
36+
Merged Cell 1
37+
</td>
38+
<td>
39+
Cell 2
40+
</td>
41+
<td>
42+
Cell 3
43+
</td>
44+
</tr>
45+
<tr>
46+
<td colspan="2">
47+
Merged Cell 4 and 5
48+
</td>
49+
</tr>
50+
<tr>
51+
<td>
52+
Cell 6
53+
</td>
54+
<td>
55+
Cell 7
56+
</td>
57+
<td>
58+
Cell 8
59+
</td>
60+
</tr>
61+
<tr>
62+
<td>
63+
Cell 9
64+
</td>
65+
<td colspan="2">
66+
A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
67+
</td>
68+
</tr>
69+
<tr>
70+
<td>
71+
Cell 10
72+
</td>
73+
<td>
74+
Cell 11
75+
</td>
76+
<td>
77+
Cell 12
78+
</td>
79+
</tr>
80+
</table>
81+
<table class="TableOfContents" id="da6c34391e544b3480e45d68f40870fa">
82+
<tr>
83+
<th>
84+
Chapter
85+
</th>
86+
<th>
87+
Title
88+
</th>
89+
<th>
90+
Page
91+
</th>
92+
</tr>
93+
<tr>
94+
<td>
95+
1
96+
</td>
97+
<td>
98+
Introduction
99+
</td>
100+
<td>
101+
1
102+
</td>
103+
</tr>
104+
<tr>
105+
<td>
106+
2
107+
</td>
108+
<td>
109+
Getting Started
110+
</td>
111+
<td>
112+
5
113+
</td>
114+
</tr>
115+
<tr>
116+
<td>
117+
3
118+
</td>
119+
<td>
120+
Basic Concepts
121+
</td>
122+
<td>
123+
12
124+
</td>
125+
</tr>
126+
<tr>
127+
<td>
128+
4
129+
</td>
130+
<td>
131+
Advanced Topics
132+
</td>
133+
<td>
134+
25
135+
</td>
136+
</tr>
137+
<tr>
138+
<td>
139+
5
140+
</td>
141+
<td>
142+
Conclusion
143+
</td>
144+
<td>
145+
40
146+
</td>
147+
</tr>
148+
</table>
149+
</body>

test_unstructured/documents/test_mappings.py

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from collections import defaultdict
2-
from typing import Dict, Type
2+
from typing import Type
33

44
from unstructured.documents import elements, ontology
55
from unstructured.documents.mappings import (
@@ -11,27 +11,20 @@
1111
from unstructured.documents.ontology import OntologyElement
1212

1313

14-
def _get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
15-
"""
16-
Get a mapping of HTML tags to their exclusive OntologyElement types.
17-
"""
18-
html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
19-
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES:
20-
for tag in element_type().allowed_tags:
21-
html_tag_to_element_type_mappings[tag].append(element_type)
14+
def test_if_all_html_tags_have_default_ontology_type():
15+
html_tag_to_possible_ontology_classes: dict[str, list[Type[ontology.OntologyElement]]] = (
16+
defaultdict(list)
17+
)
2218

23-
return {
24-
tag: element_types[0]
25-
for tag, element_types in html_tag_to_element_type_mappings.items()
26-
if len(element_types) == 1
27-
}
19+
for ontology_class in ALL_ONTOLOGY_ELEMENT_TYPES:
20+
for tag in ontology_class().allowed_tags:
21+
html_tag_to_possible_ontology_classes[tag].append(ontology_class)
2822

29-
30-
def test_if_all_exclusive_html_tags_are_mapped_to_ontology_elements():
31-
exclusive_html_tags = _get_exclusive_html_tags()
32-
for expected_tag, expected_element_type in exclusive_html_tags.items():
33-
assert expected_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
34-
assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[expected_tag] == expected_element_type
23+
for html_tag, possible_ontology_classes in html_tag_to_possible_ontology_classes.items():
24+
assert html_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
25+
assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[html_tag] in possible_ontology_classes + [
26+
ontology.UncategorizedText
27+
] # In some cases it is better to use unknown type than assign incorrect type
3528

3629

3730
def test_all_expected_ontology_types_are_subclasses_of_OntologyElement():

test_unstructured/documents/test_ontology_to_unstructured_parsing.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
181181
[
182182
("html_files/example.html", "unstructured_json_output/example.json"),
183183
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
184+
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
184185
(
185186
"html_files/example_with_inline_fields.html",
186187
"unstructured_json_output/example_with_inline_fields.json",
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
[
2+
{
3+
"element_id": "2428404551304d4db5925f6afee11ed5",
4+
"metadata": {
5+
"category_depth": 0,
6+
"filetype": "text/html",
7+
"languages": [
8+
"eng"
9+
],
10+
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
11+
"text_as_html": "<table class=\"Table\" id=\"2428404551304d4db5925f6afee11ed5\"> <tr> <th>Header 1</th><th>Header 2</th></tr><tr> <td>Row 1, Cell 1</td><td>Row 1, Cell 2</td></tr><tr> <td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr></table>"
12+
},
13+
"text": "Header 1 Header 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2",
14+
"type": "Table"
15+
},
16+
{
17+
"element_id": "9f91cae321c74b31bb1c83ac86cd7afb",
18+
"metadata": {
19+
"category_depth": 0,
20+
"filetype": "text/html",
21+
"languages": [
22+
"eng"
23+
],
24+
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
25+
"text_as_html": "<table class=\"Table\" id=\"9f91cae321c74b31bb1c83ac86cd7afb\"> <tr> <th colspan=\"3\">Big Table Header</th></tr><tr> <td rowspan=\"2\">Merged Cell 1</td><td>Cell 2</td><td>Cell 3</td></tr><tr> <td colspan=\"2\">Merged Cell 4 and 5</td></tr><tr> <td>Cell 6</td><td>Cell 7</td><td>Cell 8</td></tr><tr> <td>Cell 9</td><td colspan=\"2\">A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</td></tr><tr> <td>Cell 10</td><td>Cell 11</td><td>Cell 12</td></tr></table>"
26+
},
27+
"text": "Big Table Header Merged Cell 1 Cell 2 Cell 3 Merged Cell 4 and 5 Cell 6 Cell 7 Cell 8 Cell 9 A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Cell 10 Cell 11 Cell 12",
28+
"type": "Table"
29+
},
30+
{
31+
"element_id": "da6c34391e544b3480e45d68f40870fa",
32+
"metadata": {
33+
"category_depth": 0,
34+
"filetype": "text/html",
35+
"languages": [
36+
"eng"
37+
],
38+
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
39+
"text_as_html": "<table class=\"TableOfContents\" id=\"da6c34391e544b3480e45d68f40870fa\"> <tr> <th>Chapter</th><th>Title</th><th>Page</th></tr><tr> <td>1</td><td>Introduction</td><td>1</td></tr><tr> <td>2</td><td>Getting Started</td><td>5</td></tr><tr> <td>3</td><td>Basic Concepts</td><td>12</td></tr><tr> <td>4</td><td>Advanced Topics</td><td>25</td></tr><tr> <td>5</td><td>Conclusion</td><td>40</td></tr></table>"
40+
},
41+
"text": "Chapter Title Page 1 Introduction 1 2 Getting Started 5 3 Basic Concepts 12 4 Advanced Topics 25 5 Conclusion 40",
42+
"type": "Table"
43+
}
44+
]

test_unstructured/partition/html/test_html_to_ontology_parsing.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,8 +310,7 @@ def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mappi
310310
<div class="Page">
311311
<form class="Form">
312312
<label class="FormField" for="option1">
313-
<span class="UncategorizedText" type="radio" name="option1" value="2" checked>
314-
</span>
313+
<input class="Checkbox" type="radio" name="option1" value="2" checked />
315314
<span class="UncategorizedText">
316315
Option 1 (Checked)
317316
</span>

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.6-dev0" # pragma: no cover
1+
__version__ = "0.16.6-dev1" # pragma: no cover

unstructured/documents/mappings.py

Lines changed: 40 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]:
9999
ontology.Keywords: elements.Text,
100100
ontology.Abstract: elements.NarrativeText,
101101
ontology.Hyperlink: elements.Text,
102-
ontology.TableOfContents: elements.Text,
102+
ontology.TableOfContents: elements.Table,
103103
ontology.Index: elements.Text,
104104
ontology.Form: elements.Text,
105105
ontology.FormField: elements.Text,
@@ -140,38 +140,56 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]:
140140
}
141141

142142
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
143+
"a": ontology.Hyperlink,
144+
"address": ontology.Address,
145+
"aside": ontology.Sidebar,
146+
"audio": ontology.Audio,
147+
"blockquote": ontology.Quote,
143148
"body": ontology.Document,
149+
"button": ontology.Button,
150+
"cite": ontology.Citation,
151+
"code": ontology.CodeBlock,
152+
"del": ontology.RevisionDeletion,
153+
"div": ontology.UncategorizedText,
154+
"dl": ontology.DefinitionList,
155+
"figcaption": ontology.Caption,
156+
"figure": ontology.Figure,
144157
"footer": ontology.Footer,
145-
"aside": ontology.Sidebar,
146-
"hr": ontology.PageBreak,
158+
"form": ontology.Form,
159+
"h1": ontology.Title,
160+
"h2": ontology.Subtitle,
147161
"h3": ontology.Heading,
148162
"h4": ontology.Heading,
149163
"h5": ontology.Heading,
150164
"h6": ontology.Heading,
151-
"blockquote": ontology.Quote,
152-
"figcaption": ontology.Caption,
153-
"ol": ontology.OrderedList,
165+
"header": ontology.Header,
166+
"hr": ontology.PageBreak,
167+
"img": ontology.Image,
168+
"input": ontology.Checkbox,
169+
"ins": ontology.RevisionInsertion,
170+
"label": ontology.FormField,
154171
"li": ontology.ListItem,
172+
"mark": ontology.Highlight,
173+
"math": ontology.Equation,
174+
"meta": ontology.Keywords,
175+
"nav": ontology.Index,
176+
"ol": ontology.OrderedList,
177+
"p": ontology.Paragraph,
178+
"pre": ontology.CodeBlock,
179+
"section": ontology.Section,
180+
"span": ontology.UncategorizedText,
181+
"sub": ontology.FootnoteReference,
182+
"svg": ontology.Signature,
183+
"table": ontology.Table,
155184
"tbody": ontology.TableBody,
156-
"thead": ontology.TableHeader,
157-
"tr": ontology.TableRow,
158185
"td": ontology.TableCell,
159186
"th": ontology.TableCellHeader,
160-
"figure": ontology.Figure,
187+
"thead": ontology.TableHeader,
188+
"time": ontology.Time,
189+
"tr": ontology.TableRow,
190+
"ul": ontology.UnorderedList,
161191
"video": ontology.Video,
162-
"audio": ontology.Audio,
163-
"pre": ontology.CodeBlock,
164-
"sub": ontology.FootnoteReference,
165-
"cite": ontology.Citation,
166-
"nav": ontology.Index,
167-
"form": ontology.Form,
168-
"label": ontology.FormField,
169-
"button": ontology.Button,
170-
"mark": ontology.Highlight,
171-
"ins": ontology.RevisionInsertion,
172-
"del": ontology.RevisionDeletion,
173-
"address": ontology.Address,
174-
"table": ontology.Table,
175192
}
176193

194+
177195
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE = get_ontology_to_unstructured_type_mapping()

0 commit comments

Comments
 (0)