Skip to content

Commit ca27b8a

Browse files
authored
Set <table> to be ontology.Table not UncategorizedText (#3782)
1 parent a6aefee commit ca27b8a

File tree

5 files changed

+231
-164
lines changed

5 files changed

+231
-164
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.16.6-dev0
2+
3+
### Enhancements
4+
- **Every <table> tag is considered to be ontology.Table** Added special handling for tables in HTML partitioning. This change is made to improve the accuracy of table extraction from HTML documents.
5+
6+
### Features
7+
8+
### Fixes
9+
110
## 0.16.5
211

312
### Enhancements
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from collections import defaultdict
2+
from typing import Dict, Type
3+
4+
from unstructured.documents import elements, ontology
5+
from unstructured.documents.mappings import (
6+
ALL_ONTOLOGY_ELEMENT_TYPES,
7+
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
8+
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
9+
get_all_subclasses,
10+
)
11+
from unstructured.documents.ontology import OntologyElement
12+
13+
14+
def _get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
    """Return the HTML tags that exactly one ontology element type allows.

    Every type in ``ALL_ONTOLOGY_ELEMENT_TYPES`` is instantiated without
    arguments and its ``allowed_tags`` are collected. A tag is kept only when
    a single type lists it.

    Returns:
        A dict mapping each such tag to the one ontology type that allows it.
    """
    owners_by_tag: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
    for ontology_type in ALL_ONTOLOGY_ELEMENT_TYPES:
        for html_tag in ontology_type().allowed_tags:
            owners_by_tag[html_tag].append(ontology_type)

    exclusive_tags: dict[str, Type[OntologyElement]] = {}
    for html_tag, owners in owners_by_tag.items():
        # A tag listed by several ontology types has no single owner.
        if len(owners) != 1:
            continue
        exclusive_tags[html_tag] = owners[0]
    return exclusive_tags
28+
29+
30+
def test_if_all_exclusive_html_tags_are_mapped_to_ontology_elements():
    """Each exclusive HTML tag must map, by default, to the one type that allows it."""
    for html_tag, owning_type in _get_exclusive_html_tags().items():
        # Membership is asserted first so a missing tag fails as an assertion,
        # not as a KeyError from the lookup below.
        assert html_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
        default_type = HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[html_tag]
        assert default_type == owning_type
35+
36+
37+
def test_all_expected_ontology_types_are_subclasses_of_OntologyElement():
    """Every default type in the tag map must derive from OntologyElement."""
    default_types = HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP.values()
    for default_type in default_types:
        assert issubclass(default_type, OntologyElement)
40+
41+
42+
def test_ontology_to_unstructured_mapping_has_valid_types():
    """Keys must be ontology element classes; values must be unstructured element classes."""
    mapping_items = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE.items()
    for ontology_cls, unstructured_cls in mapping_items:
        assert issubclass(unstructured_cls, elements.Element)
        assert issubclass(ontology_cls, ontology.OntologyElement)
49+
50+
51+
def test_all_ontology_elements_are_defined_in_mapping_to_unstructured():
    """Every OntologyElement subclass must have an entry in the unstructured mapping."""
    ontology_subclasses = get_all_subclasses(ontology.OntologyElement)
    for ontology_cls in ontology_subclasses:
        assert ontology_cls in ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.5" # pragma: no cover
1+
__version__ = "0.16.6-dev0" # pragma: no cover

unstructured/documents/mappings.py

Lines changed: 116 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
of parsed documents
66
"""
77

8-
from collections import defaultdict
98
from typing import Any, Dict, Type
109

11-
from unstructured.documents.ontology import OntologyElement
10+
from unstructured.documents import elements, ontology
11+
from unstructured.documents.elements import Element
1212

1313

1414
def get_all_subclasses(cls) -> list[Any]:
@@ -30,25 +30,9 @@ def get_all_subclasses(cls) -> list[Any]:
3030
return all_subclasses
3131

3232

33-
def get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
33+
def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]:
3434
"""
35-
Get a mapping of HTML tags to their exclusive OntologyElement types.
36-
"""
37-
html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
38-
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES:
39-
for tag in element_type().allowed_tags:
40-
html_tag_to_element_type_mappings[tag].append(element_type)
41-
42-
return {
43-
tag: element_types[0]
44-
for tag, element_types in html_tag_to_element_type_mappings.items()
45-
if len(element_types) == 1
46-
}
47-
48-
49-
def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
50-
"""
51-
Get a mapping of ontology element names to unstructured type names.
35+
Get a mapping of ontology element to unstructured type.
5236
5337
The dictionary here was created based on the ontology mapping JSON.
5438
Can be generated via the following code:
@@ -63,97 +47,131 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
6347
```
6448
6549
Returns:
66-
dict: A dictionary where keys are ontology element class names
67-
and values are unstructured type names.
50+
dict: A dictionary where keys are ontology element classes
51+
and values are unstructured types.
6852
"""
6953
ontology_to_unstructured_class_mapping = {
70-
"Document": "UncategorizedText",
71-
"Section": "UncategorizedText",
72-
"Page": "UncategorizedText",
73-
"Column": "UncategorizedText",
74-
"Paragraph": "NarrativeText",
75-
"Header": "Header",
76-
"Footer": "Footer",
77-
"Sidebar": "UncategorizedText",
78-
"PageBreak": "PageBreak",
79-
"Title": "Title",
80-
"Subtitle": "Title",
81-
"Heading": "Title",
82-
"NarrativeText": "NarrativeText",
83-
"Quote": "NarrativeText",
84-
"Footnote": "UncategorizedText",
85-
"Caption": "FigureCaption",
86-
"PageNumber": "PageNumber",
87-
"UncategorizedText": "UncategorizedText",
88-
"OrderedList": "UncategorizedText",
89-
"UnorderedList": "UncategorizedText",
90-
"DefinitionList": "UncategorizedText",
91-
"ListItem": "ListItem",
92-
"Table": "Table",
93-
"TableRow": "Table",
94-
"TableCell": "Table",
95-
"TableCellHeader": "Table",
96-
"TableBody": "Table",
97-
"TableHeader": "Table",
98-
"Image": "Image",
99-
"Figure": "Image",
100-
"Video": "UncategorizedText",
101-
"Audio": "UncategorizedText",
102-
"Barcode": "Image",
103-
"QRCode": "Image",
104-
"Logo": "Image",
105-
"CodeBlock": "CodeSnippet",
106-
"InlineCode": "CodeSnippet",
107-
"Formula": "Formula",
108-
"Equation": "Formula",
109-
"FootnoteReference": "UncategorizedText",
110-
"Citation": "UncategorizedText",
111-
"Bibliography": "UncategorizedText",
112-
"Glossary": "UncategorizedText",
113-
"Author": "UncategorizedText",
114-
"MetaDate": "UncategorizedText",
115-
"Keywords": "UncategorizedText",
116-
"Abstract": "NarrativeText",
117-
"Hyperlink": "UncategorizedText",
118-
"TableOfContents": "UncategorizedText",
119-
"Index": "UncategorizedText",
120-
"Form": "UncategorizedText",
121-
"FormField": "UncategorizedText",
122-
"FormFieldValue": "UncategorizedText",
123-
"Checkbox": "UncategorizedText",
124-
"RadioButton": "UncategorizedText",
125-
"Button": "UncategorizedText",
126-
"Comment": "UncategorizedText",
127-
"Highlight": "UncategorizedText",
128-
"RevisionInsertion": "UncategorizedText",
129-
"RevisionDeletion": "UncategorizedText",
130-
"Address": "Address",
131-
"EmailAddress": "EmailAddress",
132-
"PhoneNumber": "UncategorizedText",
133-
"CalendarDate": "UncategorizedText",
134-
"Time": "UncategorizedText",
135-
"Currency": "UncategorizedText",
136-
"Measurement": "UncategorizedText",
137-
"Letterhead": "Header",
138-
"Signature": "UncategorizedText",
139-
"Watermark": "UncategorizedText",
140-
"Stamp": "UncategorizedText",
54+
ontology.Document: elements.Text,
55+
ontology.Section: elements.Text,
56+
ontology.Page: elements.Text,
57+
ontology.Column: elements.Text,
58+
ontology.Paragraph: elements.NarrativeText,
59+
ontology.Header: elements.Header,
60+
ontology.Footer: elements.Footer,
61+
ontology.Sidebar: elements.Text,
62+
ontology.PageBreak: elements.PageBreak,
63+
ontology.Title: elements.Title,
64+
ontology.Subtitle: elements.Title,
65+
ontology.Heading: elements.Title,
66+
ontology.NarrativeText: elements.NarrativeText,
67+
ontology.Quote: elements.NarrativeText,
68+
ontology.Footnote: elements.Text,
69+
ontology.Caption: elements.FigureCaption,
70+
ontology.PageNumber: elements.PageNumber,
71+
ontology.UncategorizedText: elements.Text,
72+
ontology.OrderedList: elements.Text,
73+
ontology.UnorderedList: elements.Text,
74+
ontology.DefinitionList: elements.Text,
75+
ontology.ListItem: elements.ListItem,
76+
ontology.Table: elements.Table,
77+
ontology.TableRow: elements.Table,
78+
ontology.TableCell: elements.Table,
79+
ontology.TableCellHeader: elements.Table,
80+
ontology.TableBody: elements.Table,
81+
ontology.TableHeader: elements.Table,
82+
ontology.Image: elements.Image,
83+
ontology.Figure: elements.Image,
84+
ontology.Video: elements.Text,
85+
ontology.Audio: elements.Text,
86+
ontology.Barcode: elements.Image,
87+
ontology.QRCode: elements.Image,
88+
ontology.Logo: elements.Image,
89+
ontology.CodeBlock: elements.CodeSnippet,
90+
ontology.InlineCode: elements.CodeSnippet,
91+
ontology.Formula: elements.Formula,
92+
ontology.Equation: elements.Formula,
93+
ontology.FootnoteReference: elements.Text,
94+
ontology.Citation: elements.Text,
95+
ontology.Bibliography: elements.Text,
96+
ontology.Glossary: elements.Text,
97+
ontology.Author: elements.Text,
98+
ontology.MetaDate: elements.Text,
99+
ontology.Keywords: elements.Text,
100+
ontology.Abstract: elements.NarrativeText,
101+
ontology.Hyperlink: elements.Text,
102+
ontology.TableOfContents: elements.Text,
103+
ontology.Index: elements.Text,
104+
ontology.Form: elements.Text,
105+
ontology.FormField: elements.Text,
106+
ontology.FormFieldValue: elements.Text,
107+
ontology.Checkbox: elements.Text,
108+
ontology.RadioButton: elements.Text,
109+
ontology.Button: elements.Text,
110+
ontology.Comment: elements.Text,
111+
ontology.Highlight: elements.Text,
112+
ontology.RevisionInsertion: elements.Text,
113+
ontology.RevisionDeletion: elements.Text,
114+
ontology.Address: elements.Address,
115+
ontology.EmailAddress: elements.EmailAddress,
116+
ontology.PhoneNumber: elements.Text,
117+
ontology.CalendarDate: elements.Text,
118+
ontology.Time: elements.Text,
119+
ontology.Currency: elements.Text,
120+
ontology.Measurement: elements.Text,
121+
ontology.Letterhead: elements.Header,
122+
ontology.Signature: elements.Text,
123+
ontology.Watermark: elements.Text,
124+
ontology.Stamp: elements.Text,
141125
}
142126

143127
return ontology_to_unstructured_class_mapping
144128

145129

146-
ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(OntologyElement)
147-
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[OntologyElement]] = {
130+
ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(ontology.OntologyElement)
131+
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[ontology.OntologyElement]] = {
148132
(tag, element_type().css_class_name): element_type
149133
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
150134
for tag in element_type().allowed_tags
151135
}
152-
CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = {
136+
CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
153137
element_type().css_class_name: element_type
154138
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
155139
for tag in element_type().allowed_tags
156140
}
157141

158-
EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = get_exclusive_html_tags()
159-
ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME = get_ontology_to_unstructured_type_mapping()
142+
# Hand-maintained default ontology element type for an HTML tag.
# NOTE(review): presumably consulted when a tag cannot be resolved through
# HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP / CSS_CLASS_TO_ELEMENT_TYPE_MAP;
# confirm against the parser that reads this map.
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
    "body": ontology.Document,
    "footer": ontology.Footer,
    "aside": ontology.Sidebar,
    "hr": ontology.PageBreak,
    "h3": ontology.Heading,
    "h4": ontology.Heading,
    "h5": ontology.Heading,
    "h6": ontology.Heading,
    "blockquote": ontology.Quote,
    "figcaption": ontology.Caption,
    "ol": ontology.OrderedList,
    "li": ontology.ListItem,
    "tbody": ontology.TableBody,
    "thead": ontology.TableHeader,
    "tr": ontology.TableRow,
    "td": ontology.TableCell,
    "th": ontology.TableCellHeader,
    "figure": ontology.Figure,
    "video": ontology.Video,
    "audio": ontology.Audio,
    "pre": ontology.CodeBlock,
    "sub": ontology.FootnoteReference,
    "cite": ontology.Citation,
    "nav": ontology.Index,
    "form": ontology.Form,
    "label": ontology.FormField,
    "button": ontology.Button,
    "mark": ontology.Highlight,
    "ins": ontology.RevisionInsertion,
    "del": ontology.RevisionDeletion,
    "address": ontology.Address,
    # Every <table> tag is treated as ontology.Table.
    "table": ontology.Table,
}

# Built once at import time; keys are ontology element classes and values are
# unstructured element classes.
# NOTE(review): get_ontology_to_unstructured_type_mapping() is annotated as
# returning dict[str, Element], which does not match the class-to-class dict
# it builds -- the annotation should be corrected there.
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE = get_ontology_to_unstructured_type_mapping()

0 commit comments

Comments
 (0)