fix: confluence page limits (#536)

ryannikolaidis · web-flow · commit 0ed1433f5a02 · 2025-06-23T14:21:52.000-07:00
The limit argument supported by the Confluence client does not limit
the pages as advertised. This change truncates the returned results to
max_num_of_docs_from_each_space on our side instead.

Integration test was updated to a parameterized pattern to reduce
duplication and include an additional test that validates the doc limit.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 1.0.43
+
+* **Fix document limits in Confluence connector**
+
 ## 1.0.42
 
 * **Replace no longer supported TogetherAI test model**
diff --git a/test/integration/connectors/expected_results/confluence_limit/directory_structure.json b/test/integration/connectors/expected_results/confluence_limit/directory_structure.json
@@ -0,0 +1,5 @@
+{
+  "directory_structure": [
+    "testteamsp/1605859.html"
+  ]
+}
diff --git a/test/integration/connectors/expected_results/confluence_limit/downloads/testteamsp/1605859.html b/test/integration/connectors/expected_results/confluence_limit/downloads/testteamsp/1605859.html
diff --git a/test/integration/connectors/expected_results/confluence_limit/file_data/1605859.json b/test/integration/connectors/expected_results/confluence_limit/file_data/1605859.json
@@ -0,0 +1,58 @@
+{
+  "identifier": "1605859",
+  "connector_type": "confluence",
+  "source_identifiers": {
+    "filename": "1605859.html",
+    "fullpath": "testteamsp/1605859.html",
+    "rel_path": "testteamsp/1605859.html"
+  },
+  "metadata": {
+    "url": "https://unstructured-ingest-test.atlassian.net/pages/1605859",
+    "version": "2",
+    "record_locator": {
+      "space_id": "testteamsp",
+      "document_id": "1605859"
+    },
+    "date_created": "2023-07-09T12:54:40.304Z",
+    "date_modified": "2023-07-13T14:13:27.275Z",
+    "date_processed": "1744231490.650383",
+    "permissions_data": [
+      {
+        "read": {
+          "users": [
+            "712020:5368eedf-cecd-43e1-8b25-b2221316ee6f"
+          ],
+          "groups": [
+            "5d476b78-504f-47d2-bddb-aa45ebf77753",
+            "78cd6a04-9161-4cf9-9e96-1fd605961fc0"
+          ]
+        }
+      },
+      {
+        "update": {
+          "users": [
+            "712020:5368eedf-cecd-43e1-8b25-b2221316ee6f"
+          ],
+          "groups": [
+            "78cd6a04-9161-4cf9-9e96-1fd605961fc0"
+          ]
+        }
+      },
+      {
+        "delete": {
+          "users": [],
+          "groups": []
+        }
+      }
+    ],
+    "filesize_bytes": null
+  },
+  "additional_metadata": {
+    "space_key": "testteamsp",
+    "space_id": 1605649,
+    "document_id": "1605859"
+  },
+  "reprocess": false,
+  "local_download_path": "/private/var/folders/h7/n848df9s5yn7ml8rxb61vhyc0000gp/T/tmp8s4u1xht/testteamsp/1605859.html",
+  "display_name": "test-teamspace"
+}
diff --git a/test/integration/connectors/test_confluence.py b/test/integration/connectors/test_confluence.py
@@ -6,7 +6,6 @@
 from test.integration.connectors.utils.validation.source import (
     SourceValidationConfigs,
     source_connector_validation,
-    source_filedata_display_name_set_check,
 )
 from test.integration.utils import requires_env
 from unstructured_ingest.processes.connectors.confluence import (
@@ -23,78 +22,44 @@
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
 @requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
-async def test_confluence_source(temp_dir):
-    # Retrieve environment variables
-    confluence_url = "https://unstructured-ingest-test.atlassian.net"
-    user_email = os.environ["CONFLUENCE_USER_EMAIL"]
-    api_token = os.environ["CONFLUENCE_API_TOKEN"]
-    spaces = ["testteamsp", "MFS"]
-
-    # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(api_token=api_token)
-    connection_config = ConfluenceConnectionConfig(
-        url=confluence_url,
-        username=user_email,
-        access_config=access_config,
-    )
-    index_config = ConfluenceIndexerConfig(
-        max_num_of_spaces=500,
-        max_num_of_docs_from_each_space=100,
-        spaces=spaces,
-    )
-
-    download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
-
-    # Instantiate indexer and downloader
-    indexer = ConfluenceIndexer(
-        connection_config=connection_config,
-        index_config=index_config,
-    )
-    downloader = ConfluenceDownloader(
-        connection_config=connection_config,
-        download_config=download_config,
-    )
-
-    # Run the source connector validation
-    await source_connector_validation(
-        indexer=indexer,
-        downloader=downloader,
-        configs=SourceValidationConfigs(
-            test_id="confluence",
-            expected_num_files=11,
-            validate_downloaded_files=True,
-            predownload_file_data_check=source_filedata_display_name_set_check,
-            postdownload_file_data_check=source_filedata_display_name_set_check,
-        ),
-    )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
-@requires_env("CONFLUENCE_USER_EMAIL", "CONFLUENCE_API_TOKEN")
-async def test_confluence_source_large(temp_dir):
-    # Retrieve environment variables
+@pytest.mark.parametrize(
+    "spaces,max_num_of_spaces,max_num_of_docs_from_each_space,expected_num_files,validate_downloaded_files,validate_file_data,test_id",
+    [
+        (["testteamsp", "MFS"], 500, 100, 11, True, True, "confluence"),
+        (["testteamsp"], 500, 1, 1, True, True, "confluence_limit"),
+        (["testteamsp1"], 10, 301, 301, False, False, "confluence_large"),
+    ],
+)
+async def test_confluence_source_param(
+    temp_dir,
+    spaces,
+    max_num_of_spaces,
+    max_num_of_docs_from_each_space,
+    expected_num_files,
+    validate_downloaded_files,
+    validate_file_data,
+    test_id,
+):
+    """
+    Integration test for the Confluence source connector using various space and document limits.
+    """
     confluence_url = "https://unstructured-ingest-test.atlassian.net"
     user_email = os.environ["CONFLUENCE_USER_EMAIL"]
     api_token = os.environ["CONFLUENCE_API_TOKEN"]
-    spaces = ["testteamsp1"]
 
-    # Create connection and indexer configurations
     access_config = ConfluenceAccessConfig(api_token=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
         username=user_email,
         access_config=access_config,
     )
     index_config = ConfluenceIndexerConfig(
-        max_num_of_spaces=10,
-        max_num_of_docs_from_each_space=250,
+        max_num_of_spaces=max_num_of_spaces,
+        max_num_of_docs_from_each_space=max_num_of_docs_from_each_space,
         spaces=spaces,
     )
-
     download_config = ConfluenceDownloaderConfig(download_dir=temp_dir)
 
-    # Instantiate indexer and downloader
     indexer = ConfluenceIndexer(
         connection_config=connection_config,
         index_config=index_config,
@@ -104,15 +69,13 @@ async def test_confluence_source_large(temp_dir):
         download_config=download_config,
     )
 
-    # Run the source connector validation
     await source_connector_validation(
         indexer=indexer,
         downloader=downloader,
         configs=SourceValidationConfigs(
-            test_id="confluence_large",
-            expected_num_files=301,
-            validate_file_data=False,
-            predownload_file_data_check=source_filedata_display_name_set_check,
-            postdownload_file_data_check=source_filedata_display_name_set_check,
+            test_id=test_id,
+            expected_num_files=expected_num_files,
+            validate_downloaded_files=validate_downloaded_files,
+            validate_file_data=validate_file_data
         ),
-    )
+    )
diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "1.0.42"  # pragma: no cover
+__version__ = "1.0.43"  # pragma: no cover
diff --git a/unstructured_ingest/processes/connectors/confluence.py b/unstructured_ingest/processes/connectors/confluence.py
@@ -186,12 +186,15 @@ def _get_docs_ids_within_one_space(self, space_key: str) -> List[dict]:
             pages = client.get_all_pages_from_space(
                 space=space_key,
                 start=0,
-                limit=self.index_config.max_num_of_docs_from_each_space,
                 expand=None,
                 content_type="page",  # blogpost and comment types not currently supported
                 status=None,
             )
-        doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in pages]
+        # Limit the number of documents to max_num_of_docs_from_each_space
+        # Note: this is needed because the limit field in client.get_all_pages_from_space does 
+        # not seem to work as expected
+        limited_pages = pages[: self.index_config.max_num_of_docs_from_each_space]
+        doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in limited_pages]
         return doc_ids
 
     def run(self) -> Generator[FileData, None, None]:

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +{
 +  "directory_structure": [
 +    "testteamsp/1605859.html"
 +  ]
 +}
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "1.0.42" # pragma: no cover`
	`1`	`+__version__ = "1.0.43" # pragma: no cover`