1313 SourceConnectionError ,
1414 SourceConnectionNetworkError ,
1515)
16+ from unstructured_ingest .errors_v2 import UserAuthError , UserError
1617from unstructured_ingest .logger import logger
1718from unstructured_ingest .processes .connector_registry import (
1819 SourceRegistryEntry ,
3031if TYPE_CHECKING :
3132 from office365 .onedrive .driveitems .driveItem import DriveItem
3233 from office365 .onedrive .sites .site import Site
34+ from office365 .runtime .client_request_exception import ClientRequestException
3335
# Connector identifier; used as the default value of SharepointIndexer.connector_type.
CONNECTOR_TYPE = "sharepoint"
# Legacy default path value. SharepointIndexer._is_root_path treats a path equal to
# this string the same as an empty path (i.e. the drive root).
LEGACY_DEFAULT_PATH = "Shared Documents"
@@ -82,15 +84,94 @@ def _get_drive_item(self, client_site: Site) -> DriveItem:
8284
8385
class SharepointIndexerConfig(OnedriveIndexerConfig):
    """Indexer configuration for the SharePoint source connector.

    Extends ``OnedriveIndexerConfig`` and declares ``path`` with an empty-string
    default. ``SharepointIndexer._is_root_path`` treats an empty path as the
    drive root.
    """

    # TODO: We can probably make path non-optional on OnedriveIndexerConfig once tested
    path: str = Field(default="")
8789
8890@dataclass
8991class SharepointIndexer (OnedriveIndexer ):
9092 connection_config : SharepointConnectionConfig
9193 index_config : SharepointIndexerConfig
9294 connector_type : str = CONNECTOR_TYPE
9395
def _handle_client_request_exception(self, e: ClientRequestException, context: str) -> None:
    """Translate a ``ClientRequestException`` into a user-facing error and raise it.

    This method always raises; it never returns normally.

    Args:
        e: The exception raised by the office365 client.
        context: Human-readable description of the resource being accessed;
            it is interpolated into the error message.

    Raises:
        UserAuthError: If the response status code is 401 or 403.
        UserError: If the status code is 404, any other value, or cannot be
            read from the exception.
    """
    # Tolerate an exception with a missing or None response: both lookups fall
    # back to None, which routes to the generic UserError at the bottom.
    response = getattr(e, "response", None)
    status_code = getattr(response, "status_code", None)

    # Chain every translated error to the original exception so the underlying
    # HTTP failure stays visible in tracebacks.
    if status_code == 401:
        raise UserAuthError(
            f"Unauthorized access to {context}. Check client credentials and permissions"
        ) from e
    if status_code == 403:
        raise UserAuthError(
            f"Access forbidden to {context}. "
            f"Check app permissions (Sites.Read.All required)"
        ) from e
    if status_code == 404:
        raise UserError(f"Not found: {context}") from e

    raise UserError(f"Failed to access {context}: {str(e)}") from e
113+
114+ def _is_root_path (self , path : str ) -> bool :
115+ """Check if the path represents root access (empty string or legacy default)."""
116+ return not path or not path .strip () or path == LEGACY_DEFAULT_PATH
117+
118+ def _get_target_drive_item (self , site_drive_item : DriveItem , path : str ) -> DriveItem :
119+ """Get the drive item to search in based on the path."""
120+ if self ._is_root_path (path ):
121+ return site_drive_item
122+ else :
123+ return site_drive_item .get_by_path (path ).get ().execute_query ()
124+
def _validate_folder_path(self, site_drive_item: DriveItem, path: str) -> None:
    """Validate that ``path`` can be fetched relative to ``site_drive_item``.

    Args:
        site_drive_item: Drive item the path is resolved against.
        path: Path to look up.

    Raises:
        UserAuthError: If the lookup fails with a 401 or 403 response.
        UserError: If the path is not found, the lookup fails for any other
            reason, or the fetched item is unusable.
    """
    from office365.runtime.client_request_exception import ClientRequestException

    # Only the remote lookup lives inside the try block. The "not found" check
    # below raises UserError itself, and it must not be swallowed and re-wrapped
    # by the generic handler (which would log it as an unexpected error).
    try:
        path_item = site_drive_item.get_by_path(path).get().execute_query()
    except ClientRequestException as e:
        logger.error(f"Failed to access SharePoint path '{path}': {e}")
        # Always raises a UserAuthError/UserError derived from the HTTP status.
        self._handle_client_request_exception(e, f"SharePoint path '{path}'")
    except Exception as e:
        logger.error(f"Unexpected error accessing SharePoint path '{path}': {e}")
        raise UserError(f"Failed to validate SharePoint path '{path}': {str(e)}") from e
    else:
        # NOTE(review): this only checks that the attribute exists, not that the
        # item actually is a folder - confirm whether `is_folder` should be tested.
        if path_item is None or not hasattr(path_item, "is_folder"):
            raise UserError(
                f"SharePoint path '{path}' not found in site {self.connection_config.site}. "
                f"Check that the path exists and you have access to it"
            )
        logger.info(f"SharePoint folder path '{path}' validated successfully")
143+
@requires_dependencies(["office365"], extras="sharepoint")
def precheck(self) -> None:
    """Validate the SharePoint connection before indexing.

    Requests a token, resolves the configured site and its drive item, and,
    when a non-root path is configured, validates that path.

    Raises:
        UserAuthError: For authentication/authorization failures (401/403),
            including those detected while validating the folder path.
        UserError: For not-found responses, other request failures, or any
            unexpected error during validation.
    """
    from office365.runtime.client_request_exception import ClientRequestException

    # Validate authentication - this call will raise UserAuthError if invalid
    self.connection_config.get_token()

    try:
        client = self.connection_config.get_client()
        client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
        site_drive_item = self.connection_config._get_drive_item(client_site)

        path = self.index_config.path
        if not self._is_root_path(path):
            self._validate_folder_path(site_drive_item, path)

        logger.info(
            f"SharePoint connection validated successfully for site: "
            f"{self.connection_config.site}"
        )

    except (UserAuthError, UserError):
        # Already user-facing (raised by _validate_folder_path). Re-raise as-is
        # so a UserAuthError is not re-wrapped as a generic UserError and
        # logged as "unexpected" by the catch-all handler below.
        raise
    except ClientRequestException as e:
        logger.error(f"SharePoint precheck failed for site: {self.connection_config.site}")
        self._handle_client_request_exception(
            e, f"SharePoint site {self.connection_config.site}"
        )
    except Exception as e:
        logger.error(f"Unexpected error during SharePoint precheck: {e}", exc_info=True)
        raise UserError(f"Failed to validate SharePoint connection: {str(e)}") from e
174+
94175 @requires_dependencies (["office365" ], extras = "sharepoint" )
95176 async def run_async (self , ** kwargs : Any ) -> AsyncIterator [FileData ]:
96177 from office365 .runtime .client_request_exception import ClientRequestException
@@ -113,11 +194,11 @@ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
113194 )
114195
115196 path = self .index_config .path
116- # Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not.
117- if path and path != LEGACY_DEFAULT_PATH :
118- site_drive_item = site_drive_item . get_by_path ( path ). get (). execute_query ( )
197+ target_drive_item = await asyncio . to_thread (
198+ self . _get_target_drive_item , site_drive_item , path
199+ )
119200
120- for drive_item in site_drive_item .get_files (
201+ for drive_item in target_drive_item .get_files (
121202 recursive = self .index_config .recursive
122203 ).execute_query ():
123204 file_data = await self .drive_item_to_file_data (drive_item = drive_item )
0 commit comments