From 4c74ef9ee58192ece5b89c854973a351e7ea5834 Mon Sep 17 00:00:00 2001 From: Teddy Wahle Date: Mon, 12 May 2025 17:42:27 -0700 Subject: [PATCH 1/3] Better handling for synced_block feature in Notion connector --- .../notion/types/blocks/synced_block.py | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py b/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py index c1456643f..9347d0969 100644 --- a/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +++ b/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py @@ -18,6 +18,9 @@ def can_have_children() -> bool: @classmethod def from_dict(cls, data: dict): + # Original blocks contain children content + if "children" not in data: + raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}") return cls(children=data["children"]) def get_html(self) -> Optional[HtmlTag]: @@ -31,27 +34,56 @@ class DuplicateSyncedBlock(BlockBase): @staticmethod def can_have_children() -> bool: + # Duplicate blocks themselves don't have children directly fetched here, + # but they represent content that does, so Notion API might report has_children=True + # on the parent block object. The actual children are fetched from the original block. return True @classmethod def from_dict(cls, data: dict): - return cls(**data) + # Duplicate blocks contain a 'synced_from' reference + synced_from_data = data.get("synced_from") + if not synced_from_data or not isinstance(synced_from_data, dict): + raise ValueError(f"Invalid data structure for DuplicateSyncedBlock: {data}") + # Ensure required keys are present in the nested dictionary + if "type" not in synced_from_data or "block_id" not in synced_from_data: + raise ValueError( + f"Missing 'type' or 'block_id' in synced_from data: {synced_from_data}" + ) + return cls(type=synced_from_data["type"], block_id=synced_from_data["block_id"]) def get_html(self) -> Optional[HtmlTag]: + # HTML representation might need fetching the original block's content, + # which is outside the scope of this simple data class. return None class SyncBlock(BlockBase): @staticmethod def can_have_children() -> bool: + # Synced blocks (both original and duplicate) can conceptually have children. return True @classmethod def from_dict(cls, data: dict): - if "synced_from" in data: + # Determine if it's a duplicate (has 'synced_from') or original (has 'children') + if data.get("synced_from") is not None: + # It's a duplicate block containing a reference + return DuplicateSyncedBlock.from_dict(data) + elif "children" in data: + # It's an original block containing children return OriginalSyncedBlock.from_dict(data) else: - return DuplicateSyncedBlock.from_dict(data) + # Handle cases where neither 'synced_from' nor 'children' are present. + # Notion API might return this for an empty original synced block. + # Let's treat it as an empty OriginalSyncedBlock. + # If this assumption is wrong, errors might occur later. + # Consider logging a warning here if strictness is needed. + return OriginalSyncedBlock(children=[]) + def get_html(self) -> Optional[HtmlTag]: + # The specific instance returned by from_dict (Original or Duplicate) + # will handle its own get_html logic. + # This method on the base SyncBlock might not be directly called. return None From f7a2112f179fdf845212f1d577090610c351baad Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 5 Jun 2025 11:39:13 +0200 Subject: [PATCH 2/3] version bump --- CHANGELOG.md | 4 ++++ unstructured_ingest/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5307e3e3..89e9ae463 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.0.36 + +* **Added Notion connector sync block handling by teddysupercuts** + ## 1.0.35 * **Fix output path in blob storage destination connector** diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index b7e659e14..9bd3a82ef 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "1.0.35" # pragma: no cover +__version__ = "1.0.36" # pragma: no cover From bc3b895356a5bd2b6ff876d8c49b0b4848a3ecab Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Thu, 5 Jun 2025 12:16:25 +0200 Subject: [PATCH 3/3] Moved comments to docstrings --- .../notion/types/blocks/synced_block.py | 45 ++++++++++++++----- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py b/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py index 9347d0969..08797bb19 100644 --- a/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +++ b/unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py @@ -18,7 +18,10 @@ def can_have_children() -> bool: @classmethod def from_dict(cls, data: dict): - # Original blocks contain children content + """Create OriginalSyncedBlock from dictionary data. + + Original blocks contain children content. + """ if "children" not in data: raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}") return cls(children=data["children"]) @@ -34,14 +37,20 @@ class DuplicateSyncedBlock(BlockBase): @staticmethod def can_have_children() -> bool: - # Duplicate blocks themselves don't have children directly fetched here, - # but they represent content that does, so Notion API might report has_children=True - # on the parent block object. The actual children are fetched from the original block. + """Check if duplicate synced blocks can have children. + + Duplicate blocks themselves don't have children directly fetched here, + but they represent content that does, so Notion API might report has_children=True + on the parent block object. The actual children are fetched from the original block. + """ return True @classmethod def from_dict(cls, data: dict): - # Duplicate blocks contain a 'synced_from' reference + """Create DuplicateSyncedBlock from dictionary data. + + Duplicate blocks contain a 'synced_from' reference. + """ synced_from_data = data.get("synced_from") if not synced_from_data or not isinstance(synced_from_data, dict): raise ValueError(f"Invalid data structure for DuplicateSyncedBlock: {data}") @@ -53,20 +62,29 @@ def from_dict(cls, data: dict): return cls(type=synced_from_data["type"], block_id=synced_from_data["block_id"]) def get_html(self) -> Optional[HtmlTag]: - # HTML representation might need fetching the original block's content, - # which is outside the scope of this simple data class. + """Get HTML representation of the duplicate synced block. + + HTML representation might need fetching the original block's content, + which is outside the scope of this simple data class. + """ return None class SyncBlock(BlockBase): @staticmethod def can_have_children() -> bool: - # Synced blocks (both original and duplicate) can conceptually have children. + """Check if synced blocks can have children. + + Synced blocks (both original and duplicate) can conceptually have children. + """ return True @classmethod def from_dict(cls, data: dict): - # Determine if it's a duplicate (has 'synced_from') or original (has 'children') + """Create appropriate SyncedBlock subclass from dictionary data. + + Determine if it's a duplicate (has 'synced_from') or original (has 'children'). + """ if data.get("synced_from") is not None: # It's a duplicate block containing a reference return DuplicateSyncedBlock.from_dict(data) @@ -83,7 +101,10 @@ def from_dict(cls, data: dict): def get_html(self) -> Optional[HtmlTag]: - # The specific instance returned by from_dict (Original or Duplicate) - # will handle its own get_html logic. - # This method on the base SyncBlock might not be directly called. + """Get HTML representation of the synced block. + + The specific instance returned by from_dict (Original or Duplicate) + will handle its own get_html logic. + This method on the base SyncBlock might not be directly called. + """ return None