Skip to content

Commit 2417f8e

Browse files
authored
Fix parsing when the first element's parent ID is None in v2 notion (#3752)
1 parent 9835fe4 commit 2417f8e

File tree

5 files changed

+55
-2
lines changed

5 files changed

+55
-2
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
## 0.16.3-dev1
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
9+
* **V2 elements can be parsed when the first element has no parent ID**
10+
11+
112
## 0.16.2
213

314
### Enhancements
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text
2+
from unstructured.documents.ontology import Document, Page, Paragraph
3+
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
4+
5+
6+
def test_when_first_elements_does_not_have_id():
7+
unstructured_elements = [
8+
Text(
9+
element_id="1",
10+
text="",
11+
metadata=ElementMetadata(text_as_html='<div class="Page" id="1"/>'),
12+
),
13+
NarrativeText(
14+
element_id="2",
15+
text="Example text",
16+
metadata=ElementMetadata(
17+
text_as_html='<p class="Paragraph" id="2"> Example text </p>', parent_id="1"
18+
),
19+
),
20+
]
21+
ontology = unstructured_elements_to_ontology(unstructured_elements)
22+
23+
assert isinstance(ontology, Document)
24+
25+
assert len(ontology.children) == 1
26+
page = ontology.children[0]
27+
28+
assert isinstance(page, Page)
29+
assert len(page.children) == 1
30+
paragraph = page.children[0]
31+
32+
assert isinstance(paragraph, Paragraph)
33+
assert paragraph.text == "Example text"

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.2" # pragma: no cover
1+
__version__ = "0.16.3-dev1" # pragma: no cover

unstructured/documents/ontology.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,11 @@ def __init__(self, **kwargs):
6767
if self.html_tag_name == "":
6868
self.html_tag_name = self.allowed_tags[0]
6969
if "id" not in self.additional_attributes:
70-
self.additional_attributes["id"] = str(uuid.uuid4()).replace("-", "")
70+
self.additional_attributes["id"] = self.generate_unique_id()
71+
72+
@staticmethod
73+
def generate_unique_id() -> str:
74+
return str(uuid.uuid4()).replace("-", "")
7175

7276
def to_html(self, add_children=True) -> str:
7377
additional_attrs = copy(self.additional_attributes)

unstructured/partition/html/transformations.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,11 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element])
135135
id_to_element_mapping = OrderedDict()
136136

137137
document_element_id = unstructured_elements[0].metadata.parent_id
138+
139+
if document_element_id is None:
140+
document_element_id = OntologyElement.generate_unique_id()
141+
unstructured_elements[0].metadata.parent_id = document_element_id
142+
138143
id_to_element_mapping[document_element_id] = Document(
139144
additional_attributes={"id": document_element_id}
140145
)

0 commit comments

Comments
 (0)