Skip to content

Commit b5d3268

Browse files
ds-filipknefel and Filip Knefel authored
fix: search only for valid attachment links in Confluence page (#535)
Modify the `HtmlMixin` to allow for customized hyperlink tag search in subclasses using the mixin by overwriting a dedicated method. Overwrite said method in the `ConfluenceDownloaderConfig` to target only embedded attachment files when running with `extract_files` flag. --------- Co-authored-by: Filip Knefel <[email protected]>
1 parent 65b7d8c commit b5d3268

File tree

9 files changed

+3818
-3812
lines changed

9 files changed

+3818
-3812
lines changed

.envrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
export ZENDESK_TOKEN=<REDACTED>  # SECURITY: a real-looking API token was committed here in plaintext. Treat it as compromised — revoke/rotate it in Zendesk immediately, remove it from git history, and load it from an untracked local file or a secret manager instead of committing .envrc values.

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.0.40
2+
3+
* **Fix extracting embedded files from Confluence pages**
4+
15
## 1.0.39
26

37
* **Added metadata export to milvus destination connector**

test/integration/connectors/test_milvus.py

Lines changed: 22 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,8 @@ def get_schema(enable_dynamic_field: bool = True) -> CollectionSchema:
9090
is_primary=True,
9191
auto_id=True,
9292
)
93-
embeddings_field = FieldSchema(
94-
name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=384
95-
)
96-
record_id_field = FieldSchema(
97-
name="record_id", dtype=DataType.VARCHAR, max_length=64
98-
)
93+
embeddings_field = FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=384)
94+
record_id_field = FieldSchema(name="record_id", dtype=DataType.VARCHAR, max_length=64)
9995

10096
schema = CollectionSchema(
10197
enable_dynamic_field=enable_dynamic_field,
@@ -111,9 +107,7 @@ def get_schema(enable_dynamic_field: bool = True) -> CollectionSchema:
111107

112108
def get_index_params() -> IndexParams:
113109
index_params = IndexParams()
114-
index_params.add_index(
115-
field_name="embeddings", index_type="AUTOINDEX", metric_type="COSINE"
116-
)
110+
index_params.add_index(field_name="embeddings", index_type="AUTOINDEX", metric_type="COSINE")
117111
index_params.add_index(field_name="record_id", index_type="Trie")
118112
return index_params
119113

@@ -253,9 +247,7 @@ async def test_milvus_destination(
253247
logger.debug("\n--- Running test_milvus_destination ---")
254248
upload_file_with_embeddings = add_fake_embeddings(upload_file, tmp_path)
255249
file_data = FileData(
256-
source_identifiers=SourceIdentifiers(
257-
fullpath=str(upload_file), filename=upload_file.stem
258-
),
250+
source_identifiers=SourceIdentifiers(fullpath=str(upload_file), filename=upload_file.stem),
259251
connector_type=CONNECTOR_TYPE,
260252
identifier="mock file data",
261253
)
@@ -321,9 +313,7 @@ async def test_milvus_metadata_storage_with_dynamic_fields(
321313
logger.debug("\n--- Running test_milvus_metadata_storage_with_dynamic_fields ---")
322314
upload_file_with_embeddings = add_fake_embeddings(upload_file, tmp_path)
323315
file_data = FileData(
324-
source_identifiers=SourceIdentifiers(
325-
fullpath=str(upload_file), filename=upload_file.stem
326-
),
316+
source_identifiers=SourceIdentifiers(fullpath=str(upload_file), filename=upload_file.stem),
327317
connector_type=CONNECTOR_TYPE,
328318
identifier="metadata_test_file",
329319
)
@@ -335,9 +325,7 @@ async def test_milvus_metadata_storage_with_dynamic_fields(
335325
)
336326

337327
# Verify dynamic fields are enabled
338-
assert (
339-
uploader.has_dynamic_fields_enabled()
340-
), "Collection should have dynamic fields enabled"
328+
assert uploader.has_dynamic_fields_enabled(), "Collection should have dynamic fields enabled"
341329

342330
staged_filepath = stager.run(
343331
elements_filepath=upload_file_with_embeddings,
@@ -406,9 +394,9 @@ async def test_milvus_metadata_storage_with_dynamic_fields(
406394

407395
# Verify filename is specifically stored if present
408396
if "filename" in stored_metadata:
409-
assert (
410-
sample_result["filename"] == upload_file.name
411-
), "Filename should be correctly stored"
397+
assert sample_result["filename"] == upload_file.name, (
398+
"Filename should be correctly stored"
399+
)
412400

413401

414402
@pytest.mark.asyncio
@@ -422,9 +410,7 @@ async def test_milvus_metadata_filtering_without_dynamic_fields(
422410
logger.debug("\n--- Running test_milvus_metadata_filtering_without_dynamic_fields ---")
423411
upload_file_with_embeddings = add_fake_embeddings(upload_file, tmp_path)
424412
file_data = FileData(
425-
source_identifiers=SourceIdentifiers(
426-
fullpath=str(upload_file), filename=upload_file.stem
427-
),
413+
source_identifiers=SourceIdentifiers(fullpath=str(upload_file), filename=upload_file.stem),
428414
connector_type=CONNECTOR_TYPE,
429415
identifier="no_dynamic_test_file",
430416
)
@@ -438,9 +424,9 @@ async def test_milvus_metadata_filtering_without_dynamic_fields(
438424
)
439425

440426
# Verify dynamic fields are NOT enabled
441-
assert (
442-
not uploader.has_dynamic_fields_enabled()
443-
), "Collection should NOT have dynamic fields enabled"
427+
assert not uploader.has_dynamic_fields_enabled(), (
428+
"Collection should NOT have dynamic fields enabled"
429+
)
444430

445431
staged_filepath = stager.run(
446432
elements_filepath=upload_file_with_embeddings,
@@ -487,8 +473,8 @@ async def test_milvus_metadata_filtering_without_dynamic_fields(
487473

488474
# Verify that only core fields are present (no metadata fields)
489475
sample_result = results[0]
490-
core_fields = {'id', 'record_id', 'embeddings'}
491-
476+
core_fields = {"id", "record_id", "embeddings"}
477+
492478
# The result should only contain the fields defined in the schema
493479
assert set(sample_result.keys()) == core_fields, (
494480
"Unexpected fields found in collection with dynamic fields disabled. "
@@ -504,9 +490,9 @@ def test_dynamic_fields_detection(collection: str):
504490
connection_config=MilvusConnectionConfig(uri=DB_URI),
505491
upload_config=MilvusUploaderConfig(db_name=DB_NAME, collection_name=collection),
506492
)
507-
assert (
508-
uploader_with_dynamic.has_dynamic_fields_enabled()
509-
), "Should detect dynamic fields are enabled"
493+
assert uploader_with_dynamic.has_dynamic_fields_enabled(), (
494+
"Should detect dynamic fields are enabled"
495+
)
510496

511497
# Test with dynamic fields disabled
512498
uploader_without_dynamic = MilvusUploader(
@@ -515,9 +501,9 @@ def test_dynamic_fields_detection(collection: str):
515501
db_name=DB_NAME, collection_name=COLLECTION_WITHOUT_DYNAMIC_FIELDS
516502
),
517503
)
518-
assert (
519-
not uploader_without_dynamic.has_dynamic_fields_enabled()
520-
), "Should detect dynamic fields are disabled"
504+
assert not uploader_without_dynamic.has_dynamic_fields_enabled(), (
505+
"Should detect dynamic fields are disabled"
506+
)
521507

522508

523509
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
@@ -548,9 +534,7 @@ def test_precheck_fails_on_nonexistent_collection(collection: str):
548534
def test_precheck_fails_on_nonexisting_db(collection: str):
549535
uploader = MilvusUploader(
550536
connection_config=MilvusConnectionConfig(uri=DB_URI),
551-
upload_config=MilvusUploaderConfig(
552-
db_name="nonexisting_db", collection_name=collection
553-
),
537+
upload_config=MilvusUploaderConfig(db_name="nonexisting_db", collection_name=collection),
554538
)
555539
with pytest.raises(
556540
DestinationConnectionError,

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.39" # pragma: no cover
1+
__version__ = "1.0.40" # pragma: no cover

unstructured_ingest/processes/connectors/confluence.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333

3434
if TYPE_CHECKING:
3535
from atlassian import Confluence
36+
from bs4 import BeautifulSoup
37+
from bs4.element import Tag
3638

3739
CONNECTOR_TYPE = "confluence"
3840

@@ -235,11 +237,28 @@ def run(self) -> Generator[FileData, None, None]:
235237
yield file_data
236238

237239

238-
class ConfluenceDownloaderConfig(DownloaderConfig, HtmlMixin):
240+
class ConfluenceDownloaderConfig(HtmlMixin, DownloaderConfig):
239241
max_num_metadata_permissions: int = Field(
240242
250, description="Approximate maximum number of permissions included in metadata"
241243
)
242244

245+
@requires_dependencies(["bs4"])
246+
def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
247+
from bs4.element import Tag
248+
249+
return [
250+
element
251+
for element in html_soup.find_all(
252+
"a",
253+
attrs={
254+
"class": "confluence-embedded-file",
255+
"data-linked-resource-type": "attachment",
256+
"href": True,
257+
},
258+
)
259+
if isinstance(element, Tag)
260+
]
261+
243262

244263
@dataclass
245264
class ConfluenceDownloader(Downloader):

unstructured_ingest/processes/connectors/milvus.py

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,7 @@ def get_connection_kwargs(self) -> dict[str, Any]:
5252
connection_config_dict.pop("access_config", None)
5353
connection_config_dict.update(access_config_dict)
5454
# Drop any that were not set explicitly
55-
connection_config_dict = {
56-
k: v for k, v in connection_config_dict.items() if v is not None
57-
}
55+
connection_config_dict = {k: v for k, v in connection_config_dict.items() if v is not None}
5856
return connection_config_dict
5957

6058
@requires_dependencies(["pymilvus"], extras="milvus")
@@ -142,9 +140,7 @@ def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
142140
)
143141
for json_dumps_field in json_dumps_fields:
144142
if json_dumps_field in working_data:
145-
working_data[json_dumps_field] = json.dumps(
146-
working_data[json_dumps_field]
147-
)
143+
working_data[json_dumps_field] = json.dumps(working_data[json_dumps_field])
148144
working_data[RECORD_ID_LABEL] = file_data.identifier
149145
return working_data
150146

@@ -168,9 +164,7 @@ def has_dynamic_fields_enabled(self) -> bool:
168164
"""Check if the target collection has dynamic fields enabled."""
169165
try:
170166
with self.get_client() as client:
171-
collection_info = client.describe_collection(
172-
self.upload_config.collection_name
173-
)
167+
collection_info = client.describe_collection(self.upload_config.collection_name)
174168

175169
# Check if dynamic field is enabled
176170
# The schema info should contain enable_dynamic_field or enableDynamicField
@@ -180,9 +174,7 @@ def has_dynamic_fields_enabled(self) -> bool:
180174
)
181175
return bool(schema_info)
182176
except Exception as e:
183-
logger.warning(
184-
f"Could not determine if collection has dynamic fields enabled: {e}"
185-
)
177+
logger.warning(f"Could not determine if collection has dynamic fields enabled: {e}")
186178
return False
187179

188180
@DestinationConnectionError.wrap
@@ -214,9 +206,7 @@ def delete_by_record_id(self, file_data: FileData) -> None:
214206
f"from milvus collection {self.upload_config.collection_name}"
215207
)
216208
with self.get_client() as client:
217-
delete_filter = (
218-
f'{self.upload_config.record_id_key} == "{file_data.identifier}"'
219-
)
209+
delete_filter = f'{self.upload_config.record_id_key} == "{file_data.identifier}"'
220210
resp = client.delete(
221211
collection_name=self.upload_config.collection_name, filter=delete_filter
222212
)
@@ -233,7 +223,7 @@ def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
233223
- If dynamic fields are enabled, it ensures JSON-stringified fields are decoded.
234224
- If dynamic fields are disabled, it filters out any fields not present in the schema.
235225
"""
236-
226+
237227
dynamic_fields_enabled = self.has_dynamic_fields_enabled()
238228

239229
# If dynamic fields are enabled, 'languages' field needs to be a list
@@ -267,9 +257,7 @@ def _prepare_data_for_insert(self, data: list[dict]) -> list[dict]:
267257
# Remove metadata fields that are not part of the base schema
268258
filtered_data = []
269259
for item in data:
270-
filtered_item = {
271-
key: value for key, value in item.items() if key in schema_fields
272-
}
260+
filtered_item = {key: value for key, value in item.items() if key in schema_fields}
273261
filtered_data.append(filtered_item)
274262
return filtered_data
275263

@@ -293,11 +281,7 @@ def insert_results(self, data: list[dict]):
293281
raise WriteError(
294282
f"failed to upload records to Milvus: {str(milvus_exception.message)}"
295283
) from milvus_exception
296-
if (
297-
"err_count" in res
298-
and isinstance(res["err_count"], int)
299-
and res["err_count"] > 0
300-
):
284+
if "err_count" in res and isinstance(res["err_count"], int) and res["err_count"] > 0:
301285
err_count = res["err_count"]
302286
raise WriteError(f"failed to upload {err_count} docs")
303287

unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ def can_have_children() -> bool:
1919
@classmethod
2020
def from_dict(cls, data: dict):
2121
"""Create OriginalSyncedBlock from dictionary data.
22-
22+
2323
Original blocks contain children content.
2424
"""
2525
if "children" not in data:
26-
raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
26+
raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
2727
return cls(children=data["children"])
2828

2929
def get_html(self) -> Optional[HtmlTag]:
@@ -38,7 +38,7 @@ class DuplicateSyncedBlock(BlockBase):
3838
@staticmethod
3939
def can_have_children() -> bool:
4040
"""Check if duplicate synced blocks can have children.
41-
41+
4242
Duplicate blocks themselves don't have children directly fetched here,
4343
but they represent content that does, so Notion API might report has_children=True
4444
on the parent block object. The actual children are fetched from the original block.
@@ -48,7 +48,7 @@ def can_have_children() -> bool:
4848
@classmethod
4949
def from_dict(cls, data: dict):
5050
"""Create DuplicateSyncedBlock from dictionary data.
51-
51+
5252
Duplicate blocks contain a 'synced_from' reference.
5353
"""
5454
synced_from_data = data.get("synced_from")
@@ -63,7 +63,7 @@ def from_dict(cls, data: dict):
6363

6464
def get_html(self) -> Optional[HtmlTag]:
6565
"""Get HTML representation of the duplicate synced block.
66-
66+
6767
HTML representation might need fetching the original block's content,
6868
which is outside the scope of this simple data class.
6969
"""
@@ -74,15 +74,15 @@ class SyncBlock(BlockBase):
7474
@staticmethod
7575
def can_have_children() -> bool:
7676
"""Check if synced blocks can have children.
77-
77+
7878
Synced blocks (both original and duplicate) can conceptually have children.
7979
"""
8080
return True
8181

8282
@classmethod
8383
def from_dict(cls, data: dict):
8484
"""Create appropriate SyncedBlock subclass from dictionary data.
85-
85+
8686
Determine if it's a duplicate (has 'synced_from') or original (has 'children').
8787
"""
8888
if data.get("synced_from") is not None:
@@ -99,10 +99,9 @@ def from_dict(cls, data: dict):
9999
# Consider logging a warning here if strictness is needed.
100100
return OriginalSyncedBlock(children=[])
101101

102-
103102
def get_html(self) -> Optional[HtmlTag]:
104103
"""Get HTML representation of the synced block.
105-
104+
106105
The specific instance returned by from_dict (Original or Duplicate)
107106
will handle its own get_html logic.
108107
This method on the base SyncBlock might not be directly called.

unstructured_ingest/utils/html.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from unstructured_ingest.utils.dep_check import requires_dependencies
1313

1414
if TYPE_CHECKING:
15+
from bs4 import BeautifulSoup
1516
from bs4.element import Tag
1617
from requests import Session
1718

@@ -96,7 +97,7 @@ def get_hrefs(self, url: str, html: str) -> list:
9697
from bs4 import BeautifulSoup
9798

9899
soup = BeautifulSoup(html, "html.parser")
99-
tags = soup.find_all("a", href=True)
100+
tags = self._find_hyperlink_tags(soup)
100101
hrefs = [
101102
tag["href"]
102103
for tag in tags
@@ -158,3 +159,15 @@ def extract_embedded_files(
158159
)
159160
for url_to_download in urls_to_download
160161
]
162+
163+
@requires_dependencies(["bs4"])
164+
def _find_hyperlink_tags(self, html_soup: "BeautifulSoup") -> list["Tag"]:
165+
"""Find hyperlink tags in the HTML.
166+
167+
Overwrite this method to customize the tag search.
168+
"""
169+
from bs4.element import Tag
170+
171+
return [
172+
element for element in html_soup.find_all("a", href=True) if isinstance(element, Tag)
173+
]

0 commit comments

Comments
 (0)