Skip to content

Commit 0ed1433

Browse files
fix: confluence page limits (#536)
The `limit` argument of the Confluence client's `get_all_pages_from_space` call does not cap the number of returned pages as advertised. This change truncates the returned pages to `max_num_of_docs_from_each_space` in the indexer. The integration tests were consolidated into a single parameterized test to reduce duplication, and a new case was added to validate the document limit.
1 parent ec0714c commit 0ed1433

File tree

7 files changed

+443
-68
lines changed

7 files changed

+443
-68
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.0.43
2+
3+
* **Fix document limits in Confluence connector**
4+
15
## 1.0.42
26

37
* **Replace no longer supported TogetherAI test model**
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"directory_structure": [
3+
"testteamsp/1605859.html"
4+
]
5+
}

test/integration/connectors/expected_results/confluence_limit/downloads/testteamsp/1605859.html

Lines changed: 342 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
{
2+
"identifier": "1605859",
3+
"connector_type": "confluence",
4+
"source_identifiers": {
5+
"filename": "1605859.html",
6+
"fullpath": "testteamsp/1605859.html",
7+
"rel_path": "testteamsp/1605859.html"
8+
},
9+
"metadata": {
10+
"url": "https://unstructured-ingest-test.atlassian.net/pages/1605859",
11+
"version": "2",
12+
"record_locator": {
13+
"space_id": "testteamsp",
14+
"document_id": "1605859"
15+
},
16+
"date_created": "2023-07-09T12:54:40.304Z",
17+
"date_modified": "2023-07-13T14:13:27.275Z",
18+
"date_processed": "1744231490.650383",
19+
"permissions_data": [
20+
{
21+
"read": {
22+
"users": [
23+
"712020:5368eedf-cecd-43e1-8b25-b2221316ee6f"
24+
],
25+
"groups": [
26+
"5d476b78-504f-47d2-bddb-aa45ebf77753",
27+
"78cd6a04-9161-4cf9-9e96-1fd605961fc0"
28+
]
29+
}
30+
},
31+
{
32+
"update": {
33+
"users": [
34+
"712020:5368eedf-cecd-43e1-8b25-b2221316ee6f"
35+
],
36+
"groups": [
37+
"78cd6a04-9161-4cf9-9e96-1fd605961fc0"
38+
]
39+
}
40+
},
41+
{
42+
"delete": {
43+
"users": [],
44+
"groups": []
45+
}
46+
}
47+
],
48+
"filesize_bytes": null
49+
},
50+
"additional_metadata": {
51+
"space_key": "testteamsp",
52+
"space_id": 1605649,
53+
"document_id": "1605859"
54+
},
55+
"reprocess": false,
56+
"local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmp8s4u1xht/testteamsp/1605859.html",
57+
"display_name": "test-teamspace"
58+
}

test/integration/connectors/test_confluence.py

Lines changed: 28 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from test.integration.connectors.utils.validation.source import (
77
SourceValidationConfigs,
88
source_connector_validation,
9-
source_filedata_display_name_set_check,
109
)
1110
from test.integration.utils import requires_env
1211
from unstructured_ingest.processes.connectors.confluence import (
@@ -23,78 +22,44 @@
2322
@pytest.mark.asyncio
2423
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
2524
@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
26-
async def test_confluence_source(temp_dir):
27-
# Retrieve environment variables
28-
confluence_url = "https://unstructured-ingest-test.atlassian.net"
29-
user_email = os.environ["CONFLUENCE_USER_EMAIL"]
30-
api_token = os.environ["CONFLUENCE_API_TOKEN"]
31-
spaces = ["testteamsp", "MFS"]
32-
33-
# Create connection and indexer configurations
34-
access_config = ConfluenceAccessConfig(api_token=api_token)
35-
connection_config = ConfluenceConnectionConfig(
36-
url=confluence_url,
37-
username=user_email,
38-
access_config=access_config,
39-
)
40-
index_config = ConfluenceIndexerConfig(
41-
max_num_of_spaces=500,
42-
max_num_of_docs_from_each_space=100,
43-
spaces=spaces,
44-
)
45-
46-
download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
47-
48-
# Instantiate indexer and downloader
49-
indexer = ConfluenceIndexer(
50-
connection_config=connection_config,
51-
index_config=index_config,
52-
)
53-
downloader = ConfluenceDownloader(
54-
connection_config=connection_config,
55-
download_config=download_config,
56-
)
57-
58-
# Run the source connector validation
59-
await source_connector_validation(
60-
indexer=indexer,
61-
downloader=downloader,
62-
configs=SourceValidationConfigs(
63-
test_id="confluence",
64-
expected_num_files=11,
65-
validate_downloaded_files=True,
66-
predownload_file_data_check=source_filedata_display_name_set_check,
67-
postdownload_file_data_check=source_filedata_display_name_set_check,
68-
),
69-
)
70-
71-
72-
@pytest.mark.asyncio
73-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
74-
@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
75-
async def test_confluence_source_large(temp_dir):
76-
# Retrieve environment variables
25+
@pytest.mark.parametrize(
26+
"spaces,max_num_of_spaces,max_num_of_docs_from_each_space,expected_num_files,validate_downloaded_files,validate_file_data,test_id",
27+
[
28+
(["testteamsp", "MFS"], 500, 100, 11, True, True, "confluence"),
29+
(["testteamsp"], 500, 1, 1, True, True, "confluence_limit"),
30+
(["testteamsp1"], 10, 301, 301, False, False, "confluence_large"),
31+
],
32+
)
33+
async def test_confluence_source_param(
34+
temp_dir,
35+
spaces,
36+
max_num_of_spaces,
37+
max_num_of_docs_from_each_space,
38+
expected_num_files,
39+
validate_downloaded_files,
40+
validate_file_data,
41+
test_id,
42+
):
43+
"""
44+
Integration test for the Confluence source connector using various space and document limits.
45+
"""
7746
confluence_url = "https://unstructured-ingest-test.atlassian.net"
7847
user_email = os.environ["CONFLUENCE_USER_EMAIL"]
7948
api_token = os.environ["CONFLUENCE_API_TOKEN"]
80-
spaces = ["testteamsp1"]
8149

82-
# Create connection and indexer configurations
8350
access_config = ConfluenceAccessConfig(api_token=api_token)
8451
connection_config = ConfluenceConnectionConfig(
8552
url=confluence_url,
8653
username=user_email,
8754
access_config=access_config,
8855
)
8956
index_config = ConfluenceIndexerConfig(
90-
max_num_of_spaces=10,
91-
max_num_of_docs_from_each_space=250,
57+
max_num_of_spaces=max_num_of_spaces,
58+
max_num_of_docs_from_each_space=max_num_of_docs_from_each_space,
9259
spaces=spaces,
9360
)
94-
9561
download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
9662

97-
# Instantiate indexer and downloader
9863
indexer = ConfluenceIndexer(
9964
connection_config=connection_config,
10065
index_config=index_config,
@@ -104,15 +69,13 @@ async def test_confluence_source_large(temp_dir):
10469
download_config=download_config,
10570
)
10671

107-
# Run the source connector validation
10872
await source_connector_validation(
10973
indexer=indexer,
11074
downloader=downloader,
11175
configs=SourceValidationConfigs(
112-
test_id="confluence_large",
113-
expected_num_files=301,
114-
validate_file_data=False,
115-
predownload_file_data_check=source_filedata_display_name_set_check,
116-
postdownload_file_data_check=source_filedata_display_name_set_check,
76+
test_id=test_id,
77+
expected_num_files=expected_num_files,
78+
validate_downloaded_files=validate_downloaded_files,
79+
validate_file_data=validate_file_data
11780
),
118-
)
81+
)

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.42" # pragma: no cover
1+
__version__ = "1.0.43" # pragma: no cover

unstructured_ingest/processes/connectors/confluence.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,12 +186,15 @@ def _get_docs_ids_within_one_space(self, space_key: str) -> List[dict]:
186186
pages = client.get_all_pages_from_space(
187187
space=space_key,
188188
start=0,
189-
limit=self.index_config.max_num_of_docs_from_each_space,
190189
expand=None,
191190
content_type="page", # blogpost and comment types not currently supported
192191
status=None,
193192
)
194-
doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in pages]
193+
# Limit the number of documents to max_num_of_docs_from_each_space
194+
# Note: this is needed because the limit field in client.get_all_pages_from_space does
195+
# not seem to work as expected
196+
limited_pages = pages[: self.index_config.max_num_of_docs_from_each_space]
197+
doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in limited_pages]
195198
return doc_ids
196199

197200
def run(self) -> Generator[FileData, None, None]:

0 commit comments

Comments
 (0)