55of parsed documents
66"""
77
8- from collections import defaultdict
98from typing import Any , Dict , Type
109
11- from unstructured .documents .ontology import OntologyElement
10+ from unstructured .documents import elements , ontology
11+ from unstructured .documents .elements import Element
1212
1313
1414def get_all_subclasses (cls ) -> list [Any ]:
@@ -30,25 +30,9 @@ def get_all_subclasses(cls) -> list[Any]:
3030 return all_subclasses
3131
3232
33- def get_exclusive_html_tags () -> dict [str , Type [ OntologyElement ] ]:
33+ def get_ontology_to_unstructured_type_mapping () -> dict [str , Element ]:
3434 """
35- Get a mapping of HTML tags to their exclusive OntologyElement types.
36- """
37- html_tag_to_element_type_mappings : Dict [str , list [Type [OntologyElement ]]] = defaultdict (list )
38- for element_type in ALL_ONTOLOGY_ELEMENT_TYPES :
39- for tag in element_type ().allowed_tags :
40- html_tag_to_element_type_mappings [tag ].append (element_type )
41-
42- return {
43- tag : element_types [0 ]
44- for tag , element_types in html_tag_to_element_type_mappings .items ()
45- if len (element_types ) == 1
46- }
47-
48-
49- def get_ontology_to_unstructured_type_mapping () -> dict [str , str ]:
50- """
51- Get a mapping of ontology element names to unstructured type names.
35+ Get a mapping of ontology element classes to unstructured element types.
5236
5337 The dictionary here was created based on the ontology mapping JSON.
5438 Can be generated via the following code:
@@ -63,97 +47,131 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
6347 ```
6448
6549 Returns:
66- dict: A dictionary where keys are ontology element class names
67- and values are unstructured type names .
50+ dict: A dictionary where keys are ontology element classes
51+ and values are unstructured types .
6852 """
6953 ontology_to_unstructured_class_mapping = {
70- " Document" : "UncategorizedText" ,
71- " Section" : "UncategorizedText" ,
72- " Page" : "UncategorizedText" ,
73- " Column" : "UncategorizedText" ,
74- " Paragraph" : " NarrativeText" ,
75- " Header" : " Header" ,
76- " Footer" : " Footer" ,
77- " Sidebar" : "UncategorizedText" ,
78- " PageBreak" : " PageBreak" ,
79- " Title" : " Title" ,
80- " Subtitle" : " Title" ,
81- " Heading" : " Title" ,
82- " NarrativeText" : " NarrativeText" ,
83- " Quote" : " NarrativeText" ,
84- " Footnote" : "UncategorizedText" ,
85- " Caption" : " FigureCaption" ,
86- " PageNumber" : " PageNumber" ,
87- " UncategorizedText" : "UncategorizedText" ,
88- " OrderedList" : "UncategorizedText" ,
89- " UnorderedList" : "UncategorizedText" ,
90- " DefinitionList" : "UncategorizedText" ,
91- " ListItem" : " ListItem" ,
92- " Table" : " Table" ,
93- " TableRow" : " Table" ,
94- " TableCell" : " Table" ,
95- " TableCellHeader" : " Table" ,
96- " TableBody" : " Table" ,
97- " TableHeader" : " Table" ,
98- " Image" : " Image" ,
99- " Figure" : " Image" ,
100- " Video" : "UncategorizedText" ,
101- " Audio" : "UncategorizedText" ,
102- " Barcode" : " Image" ,
103- " QRCode" : " Image" ,
104- " Logo" : " Image" ,
105- " CodeBlock" : " CodeSnippet" ,
106- " InlineCode" : " CodeSnippet" ,
107- " Formula" : " Formula" ,
108- " Equation" : " Formula" ,
109- " FootnoteReference" : "UncategorizedText" ,
110- " Citation" : "UncategorizedText" ,
111- " Bibliography" : "UncategorizedText" ,
112- " Glossary" : "UncategorizedText" ,
113- " Author" : "UncategorizedText" ,
114- " MetaDate" : "UncategorizedText" ,
115- " Keywords" : "UncategorizedText" ,
116- " Abstract" : " NarrativeText" ,
117- " Hyperlink" : "UncategorizedText" ,
118- " TableOfContents" : "UncategorizedText" ,
119- " Index" : "UncategorizedText" ,
120- " Form" : "UncategorizedText" ,
121- " FormField" : "UncategorizedText" ,
122- " FormFieldValue" : "UncategorizedText" ,
123- " Checkbox" : "UncategorizedText" ,
124- " RadioButton" : "UncategorizedText" ,
125- " Button" : "UncategorizedText" ,
126- " Comment" : "UncategorizedText" ,
127- " Highlight" : "UncategorizedText" ,
128- " RevisionInsertion" : "UncategorizedText" ,
129- " RevisionDeletion" : "UncategorizedText" ,
130- " Address" : " Address" ,
131- " EmailAddress" : " EmailAddress" ,
132- " PhoneNumber" : "UncategorizedText" ,
133- " CalendarDate" : "UncategorizedText" ,
134- " Time" : "UncategorizedText" ,
135- " Currency" : "UncategorizedText" ,
136- " Measurement" : "UncategorizedText" ,
137- " Letterhead" : " Header" ,
138- " Signature" : "UncategorizedText" ,
139- " Watermark" : "UncategorizedText" ,
140- " Stamp" : "UncategorizedText" ,
54+ ontology . Document : elements . Text ,
55+ ontology . Section : elements . Text ,
56+ ontology . Page : elements . Text ,
57+ ontology . Column : elements . Text ,
58+ ontology . Paragraph : elements . NarrativeText ,
59+ ontology . Header : elements . Header ,
60+ ontology . Footer : elements . Footer ,
61+ ontology . Sidebar : elements . Text ,
62+ ontology . PageBreak : elements . PageBreak ,
63+ ontology . Title : elements . Title ,
64+ ontology . Subtitle : elements . Title ,
65+ ontology . Heading : elements . Title ,
66+ ontology . NarrativeText : elements . NarrativeText ,
67+ ontology . Quote : elements . NarrativeText ,
68+ ontology . Footnote : elements . Text ,
69+ ontology . Caption : elements . FigureCaption ,
70+ ontology . PageNumber : elements . PageNumber ,
71+ ontology . UncategorizedText : elements . Text ,
72+ ontology . OrderedList : elements . Text ,
73+ ontology . UnorderedList : elements . Text ,
74+ ontology . DefinitionList : elements . Text ,
75+ ontology . ListItem : elements . ListItem ,
76+ ontology . Table : elements . Table ,
77+ ontology . TableRow : elements . Table ,
78+ ontology . TableCell : elements . Table ,
79+ ontology . TableCellHeader : elements . Table ,
80+ ontology . TableBody : elements . Table ,
81+ ontology . TableHeader : elements . Table ,
82+ ontology . Image : elements . Image ,
83+ ontology . Figure : elements . Image ,
84+ ontology . Video : elements . Text ,
85+ ontology . Audio : elements . Text ,
86+ ontology . Barcode : elements . Image ,
87+ ontology . QRCode : elements . Image ,
88+ ontology . Logo : elements . Image ,
89+ ontology . CodeBlock : elements . CodeSnippet ,
90+ ontology . InlineCode : elements . CodeSnippet ,
91+ ontology . Formula : elements . Formula ,
92+ ontology . Equation : elements . Formula ,
93+ ontology . FootnoteReference : elements . Text ,
94+ ontology . Citation : elements . Text ,
95+ ontology . Bibliography : elements . Text ,
96+ ontology . Glossary : elements . Text ,
97+ ontology . Author : elements . Text ,
98+ ontology . MetaDate : elements . Text ,
99+ ontology . Keywords : elements . Text ,
100+ ontology . Abstract : elements . NarrativeText ,
101+ ontology . Hyperlink : elements . Text ,
102+ ontology . TableOfContents : elements . Text ,
103+ ontology . Index : elements . Text ,
104+ ontology . Form : elements . Text ,
105+ ontology . FormField : elements . Text ,
106+ ontology . FormFieldValue : elements . Text ,
107+ ontology . Checkbox : elements . Text ,
108+ ontology . RadioButton : elements . Text ,
109+ ontology . Button : elements . Text ,
110+ ontology . Comment : elements . Text ,
111+ ontology . Highlight : elements . Text ,
112+ ontology . RevisionInsertion : elements . Text ,
113+ ontology . RevisionDeletion : elements . Text ,
114+ ontology . Address : elements . Address ,
115+ ontology . EmailAddress : elements . EmailAddress ,
116+ ontology . PhoneNumber : elements . Text ,
117+ ontology . CalendarDate : elements . Text ,
118+ ontology . Time : elements . Text ,
119+ ontology . Currency : elements . Text ,
120+ ontology . Measurement : elements . Text ,
121+ ontology . Letterhead : elements . Header ,
122+ ontology . Signature : elements . Text ,
123+ ontology . Watermark : elements . Text ,
124+ ontology . Stamp : elements . Text ,
141125 }
142126
143127 return ontology_to_unstructured_class_mapping
144128
145129
146- ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses (OntologyElement )
147- HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP : Dict [tuple [str , str ], Type [OntologyElement ]] = {
130+ ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses (ontology . OntologyElement )
131+ HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP : Dict [tuple [str , str ], Type [ontology . OntologyElement ]] = {
148132 (tag , element_type ().css_class_name ): element_type
149133 for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
150134 for tag in element_type ().allowed_tags
151135}
152- CSS_CLASS_TO_ELEMENT_TYPE_MAP : Dict [str , Type [OntologyElement ]] = {
136+ CSS_CLASS_TO_ELEMENT_TYPE_MAP : Dict [str , Type [ontology . OntologyElement ]] = {
153137 element_type ().css_class_name : element_type
154138 for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
155139 for tag in element_type ().allowed_tags
156140}
157141
158- EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP : Dict [str , Type [OntologyElement ]] = get_exclusive_html_tags ()
159- ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME = get_ontology_to_unstructured_type_mapping ()
142+ HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP : Dict [str , Type [ontology .OntologyElement ]] = {
143+ "body" : ontology .Document ,
144+ "footer" : ontology .Footer ,
145+ "aside" : ontology .Sidebar ,
146+ "hr" : ontology .PageBreak ,
147+ "h3" : ontology .Heading ,
148+ "h4" : ontology .Heading ,
149+ "h5" : ontology .Heading ,
150+ "h6" : ontology .Heading ,
151+ "blockquote" : ontology .Quote ,
152+ "figcaption" : ontology .Caption ,
153+ "ol" : ontology .OrderedList ,
154+ "li" : ontology .ListItem ,
155+ "tbody" : ontology .TableBody ,
156+ "thead" : ontology .TableHeader ,
157+ "tr" : ontology .TableRow ,
158+ "td" : ontology .TableCell ,
159+ "th" : ontology .TableCellHeader ,
160+ "figure" : ontology .Figure ,
161+ "video" : ontology .Video ,
162+ "audio" : ontology .Audio ,
163+ "pre" : ontology .CodeBlock ,
164+ "sub" : ontology .FootnoteReference ,
165+ "cite" : ontology .Citation ,
166+ "nav" : ontology .Index ,
167+ "form" : ontology .Form ,
168+ "label" : ontology .FormField ,
169+ "button" : ontology .Button ,
170+ "mark" : ontology .Highlight ,
171+ "ins" : ontology .RevisionInsertion ,
172+ "del" : ontology .RevisionDeletion ,
173+ "address" : ontology .Address ,
174+ "table" : ontology .Table ,
175+ }
176+
177+ ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE = get_ontology_to_unstructured_type_mapping ()
0 commit comments