Skip to content

Commit b5d3268

Browse files
ds-filipknefel and Filip Knefel authored
fix: search only for valid attachment links in Confluence page (#535)
Modify the `HtmlMixin` to allow for customized hyperlink tag search in subclasses using the mixin by overwriting a dedicated method. Overwrite said method in the `ConfluenceDownloaderConfig` to target only embedded attachment files when running with `extract_files` flag. --------- Co-authored-by: Filip Knefel <[email protected]>
1 parent 65b7d8c commit b5d3268

File tree

9 files changed

+3818
-3812
lines changed

9 files changed

+3818
-3812
lines changed

.envrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
export ZENDESK_TOKEN=<REDACTED>  # SECURITY: a real-looking API token was committed here in plaintext. Treat it as compromised — revoke/rotate it in Zendesk immediately, remove it from git history, and load it from an untracked local file or a secret manager instead of committing .envrc values.

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.0.40
2+
3+
* **Fix extracting embedded files from Confluence pages**
4+
15
## 1.0.39
26

37
* **Added metadata export to milvus destination connector**

test/integration/connectors/test_milvus.py

Lines changed: 22 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,8 @@ def get_schema(enable_dynamic_field: bool = True) -> CollectionSchema:
9090
is_primary=True,
9191
auto_id=True,
9292
)
93-
embeddings_field = FieldSchema(
94-
name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=384
95-
)
96-
record_id_field = FieldSchema(
97-
name="record_id", dtype=DataType.VARCHAR, max_length=64
98-
)
93+
embeddings_field = FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=384)
94+
record_id_field = FieldSchema(name="record_id", dtype=DataType.VARCHAR, max_length=64)
9995

10096
schema = CollectionSchema(
10197
enable_dynamic_field=enable_dynamic_field,
@@ -111,9 +107,7 @@ def get_schema(enable_dynamic_field: bool = True) -> CollectionSchema:
111107

112108
def get_index_params() -> IndexParams:
113109
index_params = IndexParams()
114-
index_params.add_index(
115-
field_name="embeddings", index_type="AUTOINDEX", metric_type="COSINE"
116-
)
110+
index_params.add_index(field_name="embeddings", index_type="AUTOINDEX", metric_type="COSINE")
117111
index_params.add_index(field_name="record_id", index_type="Trie")
118112
return index_params
119113

@@ -253,9 +247,7 @@ async def test_milvus_destination(
253247
logger.debug("\n--- Running test_milvus_destination ---")
254248
upload_file_with_embeddings = add_fake_embeddings(upload_file, tmp_path)
255249
file_data = FileData(
256-
source_identifiers=SourceIdentifiers(
257-
fullpath=str(upload_file), filename=upload_file.stem
258-
),
250+
source_identifiers=SourceIdentifiers(fullpath=str(upload_file), filename=upload_file.stem),
259251
connector_type=CONNECTOR_TYPE,
260252
identifier="mock file data",
261253
)
@@ -321,9 +313,7 @@ async def test_milvus_metadata_storage_with_dynamic_fields(
321313
logger.debug("\n--- Running test_milvus_metadata_storage_with_dynamic_fields ---")
322314
upload_file_with_embeddings = add_fake_embeddings(upload_file, tmp_path)
323315
file_data = FileData(
324-
source_identifiers=SourceIdentifiers(
325-
fullpath=str(upload_file), filename=upload_file.stem
326-
),
316+
source_identifiers=SourceIdentifiers(fullpath=str(upload_file), filename=upload_file.stem),
327317
connector_type=CONNECTOR_TYPE,
328318
identifier="metadata_test_file",
329319
)
@@ -335,9 +325,7 @@ async def test_milvus_metadata_storage_with_dynamic_fields(
335325
)
336326

337327
# Verify dynamic fields are enabled
338-
assert (
339-
uploader.has_dynamic_fields_enabled()
340-
), "Collection should have dynamic fields enabled"
328+
assert uploader.has_dynamic_fields_enabled(), "Collection should have dynamic fields enabled"
341329

342330
staged_filepath = stager.run(
343331
elements_filepath=upload_file_with_embeddings,
@@ -406,9 +394,9 @@ async def test_milvus_metadata_storage_with_dynamic_fields(
406394

407395
# Verify filename is specifically stored if present
408396
if "filename" in stored_metadata:
409-
assert (
410-
sample_result["filename"] == upload_file.name
411-
), "Filename should be correctly stored"
397+
assert sample_result["filename"] == upload_file.name, (
398+
"Filename should be correctly stored"
399+
)
412400

413401

414402
@pytest.mark.asyncio
@@ -422,9 +410,7 @@ async def test_milvus_metadata_filtering_without_dynamic_fields(
422410
logger.debug("\n--- Running test_milvus_metadata_filtering_without_dynamic_fields ---")
423411
upload_file_with_embeddings = add_fake_embeddings(upload_file, tmp_path)
424412
file_data = FileData(
425-
source_identifiers=SourceIdentifiers(
426-
fullpath=str(upload_file), filename=upload_file.stem
427-
),
413+
source_identifiers=SourceIdentifiers(fullpath=str(upload_file), filename=upload_file.stem),
428414
connector_type=CONNECTOR_TYPE,
429415
identifier="no_dynamic_test_file",
430416
)
@@ -438,9 +424,9 @@ async def test_milvus_metadata_filtering_without_dynamic_fields(
438424
)
439425

440426
# Verify dynamic fields are NOT enabled
441-
assert (
442-
not uploader.has_dynamic_fields_enabled()
443-
), "Collection should NOT have dynamic fields enabled"
427+
assert not uploader.has_dynamic_fields_enabled(), (
428+
"Collection should NOT have dynamic fields enabled"
429+
)
444430

445431
staged_filepath = stager.run(
446432
elements_filepath=upload_file_with_embeddings,
@@ -487,8 +473,8 @@ async def test_milvus_metadata_filtering_without_dynamic_fields(
487473

488474
# Verify that only core fields are present (no metadata fields)
489475
sample_result = results[0]
490-
core_fields = {'id', 'record_id', 'embeddings'}
491-
476+
core_fields = {"id", "record_id", "embeddings"}
477+
492478
# The result should only contain the fields defined in the schema
493479
assert set(sample_result.keys()) == core_fields, (
494480
"Unexpected fields found in collection with dynamic fields disabled. "
@@ -504,9 +490,9 @@ def test_dynamic_fields_detection(collection: str):
504490
connection_config=MilvusConnectionConfig(uri=DB_URI),
505491
upload_config=MilvusUploaderConfig(db_name=DB_NAME, collection_name=collection),
506492
)
507-
assert (
508-
uploader_with_dynamic.has_dynamic_fields_enabled()
509-
), "Should detect dynamic fields are enabled"
493+
assert uploader_with_dynamic.has_dynamic_fields_enabled(), (
494+
"Should detect dynamic fields are enabled"
495+
)
510496

511497
# Test with dynamic fields disabled
512498
uploader_without_dynamic = MilvusUploader(
@@ -515,9 +501,9 @@ def test_dynamic_fields_detection(collection: str):
515501
db_name=DB_NAME, collection_name=COLLECTION_WITHOUT_DYNAMIC_FIELDS
516502
),
517503
)
518-
assert (
519-
not uploader_without_dynamic.has_dynamic_fields_enabled()
520-
), "Should detect dynamic fields are disabled"
504+
assert not uploader_without_dynamic.has_dynamic_fields_enabled(), (
505+
"Should detect dynamic fields are disabled"
506+
)
521507

522508

523509
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
@@ -548,9 +534,7 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
548534
def test_precheck_fails_on_nonexisting_db(collection: str):
549535
uploader = MilvusUploader(
550536
connection_config=MilvusConnectionConfig(uri=DB_URI),
551-
upload_config=MilvusUploaderConfig(
552-
db_name="nonexisting_db", collection_name=collection
553-
),
537+
upload_config=MilvusUploaderConfig(db_name="nonexisting_db", collection_name=collection),
554538
)
555539
with pytest.raises(
556540
DestinationConnectionError,

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.39" # pragma: no cover
1+
__version__ = "1.0.40" # pragma: no cover

unstructured_ingest/processes/connectors/confluence.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333

3434
if TYPE_CHECKING:
3535
from atlassian import Confluence
36+
from bs4 import BeautifulSoup
37+
from bs4.element import Tag
3638

3739
CONNECTOR_TYPE = "confluence"
3840

@@ -235,11 +237,28 @@ def run(self) -> Generator[FileData, None, None]:
235237
yield file_data
236238

237239

238-
class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
240+
class ConfluenceDownloaderConfig(HtmlMixin, DownloaderConfig):
239241
max_num_metadata_permissions: int = Field(
240242
250, description="Approximate maximum number of permissions included in metadata"
241243
)
242244

245+
@requires_dependencies(["bs4"])
246+
def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
247+
from bs4.element import Tag
248+
249+
return [
250+
element
251+
for element in html_soup.find_all(
252+
"a",
253+
attrs={
254+
"class": "confluence-embedded-file",
255+
"data-linked-resource-type": "attachment",
256+
"href": True,
257+
},
258+
)
259+
if isinstance(element, Tag)
260+
]
261+
243262

244263
@dataclass
245264
class ConfluenceDownloader(Downloader):

unstructured_ingest/processes/connectors/milvus.py

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,7 @@ def get_connection_kwargs(self) -> dict[str, Any]:
5252
connection_config_dict.pop("access_config", None)
5353
connection_config_dict.update(access_config_dict)
5454
# Drop any that were not set explicitly
55-
connection_config_dict = {
56-
k: v for k, v in connection_config_dict.items() if v is not None
57-
}
55+
connection_config_dict = {k: v for k, v in connection_config_dict.items() if v is not None}
5856
return connection_config_dict
5957

6058
@requires_dependencies(["pymilvus"], extras="milvus")
@@ -142,9 +140,7 @@ def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
142140
)
143141
for json_dumps_field in json_dumps_fields:
144142
if json_dumps_field in working_data:
145-
working_data[json_dumps_field] = json.dumps(
146-
working_data[json_dumps_field]
147-
)
143+
working_data[json_dumps_field] = json.dumps(working_data[json_dumps_field])
148144
working_data[RECORD_ID_LABEL] = file_data.identifier
149145
return working_data
150146

@@ -168,9 +164,7 @@ def has_dynamic_fields_enabled(self) -> bool:
168164
"""Check if the target collection has dynamic fields enabled."""
169165
try:
170166
with self.get_client() as client:
171-
collection_info = client.describe_collection(
172-
self.upload_config.collection_name
173-
)
167+
collection_info = client.describe_collection(self.upload_config.collection_name)
174168

175169
# Check if dynamic field is enabled
176170
# The schema info should contain enable_dynamic_field or enableDynamicField
@@ -180,9 +174,7 @@ def has_dynamic_fields_enabled(self) -> bool:
180174
)
181175
return bool(schema_info)
182176
except Exception as e:
183-
logger.warning(
184-
f"Could not determine if collection has dynamic fields enabled: {e}"
185-
)
177+
logger.warning(f"Could not determine if collection has dynamic fields enabled: {e}")
186178
return False
187179

188180
@DestinationConnectionError.wrap
@@ -214,9 +206,7 @@ def delete_by_record_id(self, file_data: FileData) -> None:
214206
f"from milvus collection {self.upload_config.collection_name}"
215207
)
216208
with self.get_client() as client:
217-
delete_filter = (
218-
f'{self.upload_config.record_id_key} == "{file_data.identifier}"'
219-
)
209+
delete_filter = f'{self.upload_config.record_id_key} == "{file_data.identifier}"'
220210
resp = client.delete(
221211
collection_name=self.upload_config.collection_name, filter=delete_filter
222212
)
@@ -233,7 +223,7 @@ def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
233223
- If dynamic fields are enabled, it ensures JSON-stringified fields are decoded.
234224
- If dynamic fields are disabled, it filters out any fields not present in the schema.
235225
"""
236-
226+
237227
dynamic_fields_enabled = self.has_dynamic_fields_enabled()
238228

239229
# If dynamic fields are enabled, 'languages' field needs to be a list
@@ -267,9 +257,7 @@ def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
267257
# Remove metadata fields that are not part of the base schema
268258
filtered_data = []
269259
for item in data:
270-
filtered_item = {
271-
key: value for key, value in item.items() if key in schema_fields
272-
}
260+
filtered_item = {key: value for key, value in item.items() if key in schema_fields}
273261
filtered_data.append(filtered_item)
274262
return filtered_data
275263

@@ -293,11 +281,7 @@ def insert_results(self, data: list[dict]):
293281
raise WriteError(
294282
f"failed to upload records to Milvus: {str(milvus_exception.message)}"
295283
) from milvus_exception
296-
if (
297-
"err_count" in res
298-
and isinstance(res["err_count"], int)
299-
and res["err_count"] > 0
300-
):
284+
if "err_count" in res and isinstance(res["err_count"], int) and res["err_count"] > 0:
301285
err_count = res["err_count"]
302286
raise WriteError(f"failed to upload {err_count} docs")
303287

unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ def can_have_children() -> bool:
1919
@classmethod
2020
def from_dict(cls, data: dict):
2121
"""Create OriginalSyncedBlock from dictionary data.
22-
22+
2323
Original blocks contain children content.
2424
"""
2525
if "children" not in data:
26-
raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
26+
raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
2727
return cls(children=data["children"])
2828

2929
def get_html(self) -> Optional[HtmlTag]:
@@ -38,7 +38,7 @@ class DuplicateSyncedBlock(BlockBase):
3838
@staticmethod
3939
def can_have_children() -> bool:
4040
"""Check if duplicate synced blocks can have children.
41-
41+
4242
Duplicate blocks themselves don't have children directly fetched here,
4343
but they represent content that does, so Notion API might report has_children=True
4444
on the parent block object. The actual children are fetched from the original block.
@@ -48,7 +48,7 @@ def can_have_children() -> bool:
4848
@classmethod
4949
def from_dict(cls, data: dict):
5050
"""Create DuplicateSyncedBlock from dictionary data.
51-
51+
5252
Duplicate blocks contain a 'synced_from' reference.
5353
"""
5454
synced_from_data = data.get("synced_from")
@@ -63,7 +63,7 @@ def from_dict(cls, data: dict):
6363

6464
def get_html(self) -> Optional[HtmlTag]:
6565
"""Get HTML representation of the duplicate synced block.
66-
66+
6767
HTML representation might need fetching the original block's content,
6868
which is outside the scope of this simple data class.
6969
"""
@@ -74,15 +74,15 @@ class SyncBlock(BlockBase):
7474
@staticmethod
7575
def can_have_children() -> bool:
7676
"""Check if synced blocks can have children.
77-
77+
7878
Synced blocks (both original and duplicate) can conceptually have children.
7979
"""
8080
return True
8181

8282
@classmethod
8383
def from_dict(cls, data: dict):
8484
"""Create appropriate SyncedBlock subclass from dictionary data.
85-
85+
8686
Determine if it's a duplicate (has 'synced_from') or original (has 'children').
8787
"""
8888
if data.get("synced_from") is not None:
@@ -99,10 +99,9 @@ def from_dict(cls, data: dict):
9999
# Consider logging a warning here if strictness is needed.
100100
return OriginalSyncedBlock(children=[])
101101

102-
103102
def get_html(self) -> Optional[HtmlTag]:
104103
"""Get HTML representation of the synced block.
105-
104+
106105
The specific instance returned by from_dict (Original or Duplicate)
107106
will handle its own get_html logic.
108107
This method on the base SyncBlock might not be directly called.

unstructured_ingest/utils/html.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from unstructured_ingest.utils.dep_check import requires_dependencies
1313

1414
if TYPE_CHECKING:
15+
from bs4 import BeautifulSoup
1516
from bs4.element import Tag
1617
from requests import Session
1718

@@ -96,7 +97,7 @@ def get_hrefs(self, url: str, html: str) -> list:
9697
from bs4 import BeautifulSoup
9798

9899
soup = BeautifulSoup(html, "html.parser")
99-
tags = soup.find_all("a", href=True)
100+
tags = self._find_hyperlink_tags(soup)
100101
hrefs = [
101102
tag["href"]
102103
for tag in tags
@@ -158,3 +159,15 @@ def extract_embedded_files(
158159
)
159160
for url_to_download in urls_to_download
160161
]
162+
163+
@requires_dependencies(["bs4"])
164+
def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
165+
"""Find hyperlink tags in the HTML.
166+
167+
Overwrite this method to customize the tag search.
168+
"""
169+
from bs4.element import Tag
170+
171+
return [
172+
element for element in html_soup.find_all("a", href=True) if isinstance(element, Tag)
173+
]

0 commit comments

Comments
 (0)