Skip to content

Commit 7c6884d

Browse files
ryannikolaidisraiderrobertFilip Knefel
authored
feat: check all user specified spaces for confluence source (#523)
Improve `precheck` method of Confluence's Indexer. Validate that each space provided in configuration can be accessed, raise exception if at least one of them can't. --------- Co-authored-by: Rob Roskam <[email protected]> Co-authored-by: Filip Knefel <[email protected]>
1 parent f5d93be commit 7c6884d

File tree

5 files changed

+3875
-3759
lines changed

5 files changed

+3875
-3759
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.0.34-dev
2+
3+
* **Improve Confluence Indexer's precheck** - validate access to each space
4+
15
## 1.0.33
26

37
* **Fix google drive not setting the display_name property on the FileData object**

test/unit/connectors/test_confluence.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,27 @@
1+
from unittest import mock
2+
13
import pytest
24
from pydantic import ValidationError
35

46
from unstructured_ingest.processes.connectors.confluence import (
57
ConfluenceAccessConfig,
68
ConfluenceConnectionConfig,
9+
ConfluenceIndexer,
10+
ConfluenceIndexerConfig,
711
)
812

913

14+
@pytest.fixture
15+
def connection_config():
16+
"""Provides a minimal ConfluenceConnectionConfig for testing."""
17+
access_config = ConfluenceAccessConfig(api_token="token")
18+
return ConfluenceConnectionConfig(
19+
url="https://dummy",
20+
username="user",
21+
access_config=access_config,
22+
)
23+
24+
1025
def test_connection_config_multiple_auth():
1126
with pytest.raises(ValidationError):
1227
ConfluenceConnectionConfig(
@@ -69,3 +84,80 @@ def test_connection_config_pat_auth():
6984
access_config=ConfluenceAccessConfig(token="access_token"),
7085
url="url",
7186
)
87+
88+
89+
def test_precheck_with_spaces_calls_get_space(monkeypatch, connection_config):
90+
"""Test that precheck calls get_space for each space when spaces are set."""
91+
spaces = ["A", "B", "C"]
92+
index_config = ConfluenceIndexerConfig(
93+
max_num_of_spaces=100,
94+
max_num_of_docs_from_each_space=100,
95+
spaces=spaces,
96+
)
97+
indexer = ConfluenceIndexer(connection_config=connection_config, index_config=index_config)
98+
mock_client = mock.MagicMock()
99+
with mock.patch.object(type(connection_config), "get_client", mock.MagicMock()):
100+
type(connection_config).get_client.return_value.__enter__.return_value = mock_client
101+
102+
result = indexer.precheck()
103+
calls = [mock.call(space) for space in spaces]
104+
mock_client.get_space.assert_has_calls(calls, any_order=False)
105+
assert mock_client.get_space.call_count == len(spaces)
106+
assert result is True
107+
108+
109+
def test_precheck_without_spaces_calls_get_all_spaces(monkeypatch, connection_config):
110+
"""Test that precheck calls get_all_spaces when spaces is not set."""
111+
index_config = ConfluenceIndexerConfig(
112+
max_num_of_spaces=100,
113+
max_num_of_docs_from_each_space=100,
114+
spaces=None,
115+
)
116+
indexer = ConfluenceIndexer(connection_config=connection_config, index_config=index_config)
117+
mock_client = mock.MagicMock()
118+
with mock.patch.object(type(connection_config), "get_client", mock.MagicMock()):
119+
type(connection_config).get_client.return_value.__enter__.return_value = mock_client
120+
121+
result = indexer.precheck()
122+
mock_client.get_all_spaces.assert_called_once_with(limit=1)
123+
mock_client.get_space.assert_not_called()
124+
assert result is True
125+
126+
127+
def test_precheck_with_spaces_raises(monkeypatch, connection_config):
128+
"""Test that precheck raises UserError if get_space fails."""
129+
spaces = ["A", "B"]
130+
index_config = ConfluenceIndexerConfig(
131+
max_num_of_spaces=100,
132+
max_num_of_docs_from_each_space=100,
133+
spaces=spaces,
134+
)
135+
indexer = ConfluenceIndexer(connection_config=connection_config, index_config=index_config)
136+
mock_client = mock.MagicMock()
137+
mock_client.get_space.side_effect = Exception("fail")
138+
from unstructured_ingest.processes.connectors.confluence import UserError
139+
140+
with mock.patch.object(type(connection_config), "get_client", mock.MagicMock()):
141+
type(connection_config).get_client.return_value.__enter__.return_value = mock_client
142+
143+
with pytest.raises(UserError):
144+
indexer.precheck()
145+
146+
147+
def test_precheck_without_spaces_raises(monkeypatch, connection_config):
148+
"""Test that precheck raises SourceConnectionError if get_all_spaces fails."""
149+
index_config = ConfluenceIndexerConfig(
150+
max_num_of_spaces=100,
151+
max_num_of_docs_from_each_space=100,
152+
spaces=None,
153+
)
154+
indexer = ConfluenceIndexer(connection_config=connection_config, index_config=index_config)
155+
mock_client = mock.MagicMock()
156+
mock_client.get_all_spaces.side_effect = Exception("fail")
157+
from unstructured_ingest.processes.connectors.confluence import UserError
158+
159+
with mock.patch.object(type(connection_config), "get_client", mock.MagicMock()):
160+
type(connection_config).get_client.return_value.__enter__.return_value = mock_client
161+
162+
with pytest.raises(UserError):
163+
indexer.precheck()

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.33" # pragma: no cover
1+
__version__ = "1.0.34-dev" # pragma: no cover

unstructured_ingest/processes/connectors/confluence.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
SourceIdentifiers,
1313
)
1414
from unstructured_ingest.error import SourceConnectionError
15+
from unstructured_ingest.errors_v2 import UserAuthError, UserError
1516
from unstructured_ingest.interfaces import (
1617
AccessConfig,
1718
ConnectionConfig,
@@ -96,7 +97,7 @@ def password_or_api_token(self) -> str:
9697

9798
@requires_dependencies(["atlassian"], extras="confluence")
9899
@contextmanager
99-
def get_client(self) -> "Confluence":
100+
def get_client(self) -> Generator["Confluence", None, None]:
100101
from atlassian import Confluence
101102

102103
access_configs = self.access_config.get_secret_value()
@@ -126,15 +127,36 @@ class ConfluenceIndexer(Indexer):
126127

127128
def precheck(self) -> bool:
128129
try:
129-
# Attempt to retrieve a list of spaces with limit=1.
130-
# This should only succeed if all creds are valid
131-
with self.connection_config.get_client() as client:
130+
self.connection_config.get_client()
131+
except Exception as e:
132+
logger.exception(f"Failed to connect to Confluence: {e}")
133+
raise UserAuthError(f"Failed to connect to Confluence: {e}")
134+
135+
with self.connection_config.get_client() as client:
136+
# opportunistically check the first space in list of all spaces
137+
try:
132138
client.get_all_spaces(limit=1)
139+
except Exception as e:
140+
logger.exception(f"Failed to connect to find any Confluence space: {e}")
141+
raise UserError(f"Failed to connect to find any Confluence space: {e}")
142+
133143
logger.info("Connection to Confluence successful.")
134-
return True
135-
except Exception as e:
136-
logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
137-
raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
144+
145+
# If specific spaces are provided, check if we can access them
146+
errors = []
147+
148+
if self.index_config.spaces:
149+
for space_key in self.index_config.spaces:
150+
try:
151+
client.get_space(space_key)
152+
except Exception as e:
153+
logger.exception(f"Failed to connect to Confluence: {e}")
154+
errors.append(f"Failed to connect to '{space_key}' space, cause: '{e}'")
155+
156+
if errors:
157+
raise UserError("\n".join(errors))
158+
159+
return True
138160

139161
def _get_space_ids_and_keys(self) -> List[Tuple[str, int]]:
140162
"""
@@ -406,7 +428,7 @@ def run(self, file_data: FileData, **kwargs) -> download_responses:
406428
expand="history.lastUpdated,version,body.view",
407429
)
408430
except Exception as e:
409-
logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
431+
logger.exception(f"Failed to retrieve page with ID {doc_id}: {e}")
410432
raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
411433

412434
if not page:

0 commit comments

Comments
 (0)