Skip to content

Commit d9bcf35

Browse files
authored
fix: add precheck method to SharePoint connector (#556)
1 parent 688c1a7 commit d9bcf35

File tree

5 files changed

+230
-11
lines changed

5 files changed

+230
-11
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 1.0.55
2+
3+
* **Fix: add precheck method to SharePoint connector**
4+
5+
16
## 1.0.54
27

38
* **Fix bump Togetherai dependency**

test/integration/connectors/test_sharepoint.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,3 +326,120 @@ async def test_sharepoint_library_with_path(temp_dir):
326326
postdownload_file_data_check=source_filedata_display_name_set_check,
327327
),
328328
)
329+
330+
331+
@pytest.fixture
332+
def base_sharepoint_config():
333+
"""Base SharePoint config for testing."""
334+
config = sharepoint_config()
335+
return {
336+
'client_id': config.client_id,
337+
'tenant': config.tenant,
338+
'site': "https://unstructuredio.sharepoint.com/sites/utic-platform-test-source",
339+
'client_cred': config.client_cred
340+
}
341+
342+
343+
@pytest.fixture
344+
def indexer_factory(base_sharepoint_config):
345+
"""Factory for creating SharePoint indexers with different configs."""
346+
def _create_indexer(client_cred=None, site=None, path=None):
347+
access_config = SharepointAccessConfig(
348+
client_cred=client_cred or base_sharepoint_config['client_cred']
349+
)
350+
connection_config = SharepointConnectionConfig(
351+
client_id=base_sharepoint_config['client_id'],
352+
site=site or base_sharepoint_config['site'],
353+
tenant=base_sharepoint_config['tenant'],
354+
access_config=access_config,
355+
)
356+
index_config = SharepointIndexerConfig(path=path or "")
357+
return SharepointIndexer(
358+
connection_config=connection_config,
359+
index_config=index_config,
360+
)
361+
return _create_indexer
362+
363+
364+
@pytest.mark.asyncio
365+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
366+
@requires_env("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CRED", "MS_TENANT_ID", "MS_USER_PNAME")
367+
@pytest.mark.parametrize("error_scenario,expected_error", [
368+
("invalid_creds", "UserAuthError"),
369+
("nonexistent_site", "UserError"),
370+
("invalid_path", "UserError"),
371+
])
372+
async def test_sharepoint_precheck_error_scenarios(indexer_factory, error_scenario, expected_error):
373+
"""Parametrized test for different SharePoint precheck error scenarios."""
374+
from unstructured_ingest.errors_v2 import UserAuthError, UserError
375+
376+
error_class_map = {
377+
"UserAuthError": UserAuthError,
378+
"UserError": UserError
379+
}
380+
381+
expected_exception = error_class_map[expected_error]
382+
383+
if error_scenario == "invalid_creds":
384+
indexer = indexer_factory(client_cred="invalid_creds")
385+
elif error_scenario == "nonexistent_site":
386+
indexer = indexer_factory(site="https://unstructuredai.sharepoint.com/sites/definitely-does-not-exist-12345")
387+
elif error_scenario == "invalid_path":
388+
indexer = indexer_factory(path="NonExistentFolder/SubFolder/DoesNotExist")
389+
else:
390+
pytest.fail(f"Unknown error scenario: {error_scenario}")
391+
392+
with pytest.raises(expected_exception):
393+
indexer.precheck()
394+
395+
396+
@pytest.fixture
397+
def insufficient_perms_config():
398+
"""Config for testing insufficient permissions."""
399+
from types import SimpleNamespace
400+
401+
required_vars = [
402+
"SHAREPOINT_CLIENT_ID_INSUFFICIENT",
403+
"SHAREPOINT_CRED_INSUFFICIENT",
404+
"MS_TENANT_ID_INSUFFICIENT"
405+
]
406+
407+
missing_vars = [var for var in required_vars if var not in os.environ]
408+
if missing_vars:
409+
pytest.skip(f"Missing environment variables: {missing_vars}")
410+
411+
return SimpleNamespace(
412+
client_id=os.environ["SHAREPOINT_CLIENT_ID_INSUFFICIENT"],
413+
client_cred=os.environ["SHAREPOINT_CRED_INSUFFICIENT"],
414+
tenant=os.environ["MS_TENANT_ID_INSUFFICIENT"]
415+
)
416+
417+
418+
@pytest.mark.asyncio
419+
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
420+
@requires_env("MS_USER_PNAME")
421+
async def test_sharepoint_precheck_insufficient_permissions(
422+
insufficient_perms_config,
423+
base_sharepoint_config,
424+
):
425+
"""Test precheck with credentials that have insufficient permissions."""
426+
from unstructured_ingest.errors_v2 import UserAuthError
427+
428+
access_config = SharepointAccessConfig(client_cred=insufficient_perms_config.client_cred)
429+
connection_config = SharepointConnectionConfig(
430+
client_id=insufficient_perms_config.client_id,
431+
site=base_sharepoint_config['site'],
432+
tenant=insufficient_perms_config.tenant,
433+
access_config=access_config,
434+
)
435+
index_config = SharepointIndexerConfig()
436+
437+
indexer = SharepointIndexer(
438+
connection_config=connection_config,
439+
index_config=index_config,
440+
)
441+
442+
with pytest.raises(UserAuthError):
443+
indexer.precheck()
444+
445+

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.54" # pragma: no cover
1+
__version__ = "1.0.55" # pragma: no cover

unstructured_ingest/processes/connectors/onedrive.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
SourceConnectionError,
2121
SourceConnectionNetworkError,
2222
)
23+
from unstructured_ingest.errors_v2 import UserAuthError
2324
from unstructured_ingest.interfaces import (
2425
AccessConfig,
2526
ConnectionConfig,
@@ -114,12 +115,27 @@ def get_token(self):
114115
except ValueError as exc:
115116
logger.error("Couldn't set up credentials.")
116117
raise exc
118+
117119
if "error" in token:
118-
raise SourceConnectionNetworkError(
119-
"failed to fetch token, {}: {}".format(
120-
token["error"], token["error_description"]
120+
error_codes = token.get("error_codes", [])
121+
error_type = token.get("error", "")
122+
error_description = token.get("error_description", "")
123+
124+
# 7000215: Invalid client secret provided
125+
# 7000218: Invalid client id provided
126+
# 700016: Application not found in directory
127+
# 90002: Tenant not found
128+
auth_error_codes = [7000215, 7000218, 700016, 90002]
129+
130+
if (any(code in error_codes for code in auth_error_codes) or
131+
error_type in ["invalid_client", "unauthorized_client", "invalid_grant"]):
132+
raise UserAuthError(
133+
f"Authentication failed: {error_type}: {error_description}"
134+
)
135+
else:
136+
raise SourceConnectionNetworkError(
137+
f"Failed to fetch token: {error_type}: {error_description}"
121138
)
122-
)
123139
return token
124140

125141
@requires_dependencies(["office365"], extras="onedrive")

unstructured_ingest/processes/connectors/sharepoint.py

Lines changed: 87 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
SourceConnectionError,
1414
SourceConnectionNetworkError,
1515
)
16+
from unstructured_ingest.errors_v2 import UserAuthError, UserError
1617
from unstructured_ingest.logger import logger
1718
from unstructured_ingest.processes.connector_registry import (
1819
SourceRegistryEntry,
@@ -30,6 +31,7 @@
3031
if TYPE_CHECKING:
3132
from office365.onedrive.driveitems.driveItem import DriveItem
3233
from office365.onedrive.sites.site import Site
34+
from office365.runtime.client_request_exception import ClientRequestException
3335

3436
CONNECTOR_TYPE = "sharepoint"
3537
LEGACY_DEFAULT_PATH = "Shared Documents"
@@ -82,15 +84,94 @@ def _get_drive_item(self, client_site: Site) -> DriveItem:
8284

8385

8486
class SharepointIndexerConfig(OnedriveIndexerConfig):
85-
pass
86-
87+
# TODO: We can probably make path non-optional on OnedriveIndexerConfig once tested
88+
path: str = Field(default="")
8789

8890
@dataclass
8991
class SharepointIndexer(OnedriveIndexer):
9092
connection_config: SharepointConnectionConfig
9193
index_config: SharepointIndexerConfig
9294
connector_type: str = CONNECTOR_TYPE
9395

96+
def _handle_client_request_exception(self, e: ClientRequestException, context: str) -> None:
97+
"""Convert ClientRequestException to appropriate user-facing error based on HTTP status."""
98+
if hasattr(e, "response") and e.response is not None and hasattr(e.response, "status_code"):
99+
status_code = e.response.status_code
100+
if status_code == 401:
101+
raise UserAuthError(
102+
f"Unauthorized access to {context}. Check client credentials and permissions"
103+
)
104+
elif status_code == 403:
105+
raise UserAuthError(
106+
f"Access forbidden to {context}. "
107+
f"Check app permissions (Sites.Read.All required)"
108+
)
109+
elif status_code == 404:
110+
raise UserError(f"Not found: {context}")
111+
112+
raise UserError(f"Failed to access {context}: {str(e)}")
113+
114+
def _is_root_path(self, path: str) -> bool:
115+
"""Check if the path represents root access (empty string or legacy default)."""
116+
return not path or not path.strip() or path == LEGACY_DEFAULT_PATH
117+
118+
def _get_target_drive_item(self, site_drive_item: DriveItem, path: str) -> DriveItem:
119+
"""Get the drive item to search in based on the path."""
120+
if self._is_root_path(path):
121+
return site_drive_item
122+
else:
123+
return site_drive_item.get_by_path(path).get().execute_query()
124+
125+
def _validate_folder_path(self, site_drive_item: DriveItem, path: str) -> None:
126+
"""Validate that a specific folder path exists and is accessible."""
127+
from office365.runtime.client_request_exception import ClientRequestException
128+
129+
try:
130+
path_item = site_drive_item.get_by_path(path).get().execute_query()
131+
if path_item is None or not hasattr(path_item, "is_folder"):
132+
raise UserError(
133+
f"SharePoint path '{path}' not found in site {self.connection_config.site}. "
134+
f"Check that the path exists and you have access to it"
135+
)
136+
logger.info(f"SharePoint folder path '{path}' validated successfully")
137+
except ClientRequestException as e:
138+
logger.error(f"Failed to access SharePoint path '{path}': {e}")
139+
self._handle_client_request_exception(e, f"SharePoint path '{path}'")
140+
except Exception as e:
141+
logger.error(f"Unexpected error accessing SharePoint path '{path}': {e}")
142+
raise UserError(f"Failed to validate SharePoint path '{path}': {str(e)}")
143+
144+
@requires_dependencies(["office365"], extras="sharepoint")
145+
def precheck(self) -> None:
146+
"""Validate SharePoint connection before indexing."""
147+
from office365.runtime.client_request_exception import ClientRequestException
148+
149+
# Validate authentication - this call will raise UserAuthError if invalid
150+
self.connection_config.get_token()
151+
152+
try:
153+
client = self.connection_config.get_client()
154+
client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
155+
site_drive_item = self.connection_config._get_drive_item(client_site)
156+
157+
path = self.index_config.path
158+
if not self._is_root_path(path):
159+
self._validate_folder_path(site_drive_item, path)
160+
161+
logger.info(
162+
f"SharePoint connection validated successfully for site: "
163+
f"{self.connection_config.site}"
164+
)
165+
166+
except ClientRequestException as e:
167+
logger.error(f"SharePoint precheck failed for site: {self.connection_config.site}")
168+
self._handle_client_request_exception(
169+
e, f"SharePoint site {self.connection_config.site}"
170+
)
171+
except Exception as e:
172+
logger.error(f"Unexpected error during SharePoint precheck: {e}", exc_info=True)
173+
raise UserError(f"Failed to validate SharePoint connection: {str(e)}")
174+
94175
@requires_dependencies(["office365"], extras="sharepoint")
95176
async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
96177
from office365.runtime.client_request_exception import ClientRequestException
@@ -113,11 +194,11 @@ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
113194
)
114195

115196
path = self.index_config.path
116-
# Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not.
117-
if path and path != LEGACY_DEFAULT_PATH:
118-
site_drive_item = site_drive_item.get_by_path(path).get().execute_query()
197+
target_drive_item = await asyncio.to_thread(
198+
self._get_target_drive_item, site_drive_item, path
199+
)
119200

120-
for drive_item in site_drive_item.get_files(
201+
for drive_item in target_drive_item.get_files(
121202
recursive=self.index_config.recursive
122203
).execute_query():
123204
file_data = await self.drive_item_to_file_data(drive_item=drive_item)

0 commit comments

Comments
 (0)