Skip to content

Commit 69320f3

Browse files
Fix stripping extensions in Google Drive (#460)
Fix extension dot stripping in the Google Drive connector
1 parent c70ee0b commit 69320f3

File tree

3 files changed

+43
-18
lines changed

3 files changed

+43
-18
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 1.0.6-dev0
2+
3+
### Fixes
4+
5+
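* **Google Drive connector now strips the leading dot in extensions properly** The stripping logic moved from `__post_init__` to `model_post_init` and now uses `lstrip(".")`, so all leading dots are removed from each configured extension.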
6+
17
## 1.0.5
28

39
### Fixes

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.5" # pragma: no cover
1+
__version__ = "1.0.6-dev0" # pragma: no cover

unstructured_ingest/processes/connectors/google_drive.py

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,12 @@
5050

5151

5252
class GoogleDriveAccessConfig(AccessConfig):
53-
service_account_key: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
54-
default=None, description="Credentials values to use for authentication"
55-
)
53+
service_account_key: Optional[
54+
Annotated[dict, BeforeValidator(conform_string_to_dict)]
55+
] = Field(default=None, description="Credentials values to use for authentication")
5656
service_account_key_path: Optional[Path] = Field(
57-
default=None, description="File path to credentials values to use for authentication"
57+
default=None,
58+
description="File path to credentials values to use for authentication",
5859
)
5960

6061
def model_post_init(self, __context: Any) -> None:
@@ -111,10 +112,9 @@ class GoogleDriveIndexerConfig(IndexerConfig):
111112
extensions: Optional[list[str]] = None
112113
recursive: bool = False
113114

114-
def __post_init__(self):
115-
# Strip leading period of extension
115+
def model_post_init(self, __context: Any) -> None:
116116
if self.extensions is not None:
117-
self.extensions = [e[1:] if e.startswith(".") else e for e in self.extensions]
117+
self.extensions = [e.lstrip(".") for e in self.extensions]
118118

119119

120120
@dataclass
@@ -165,10 +165,14 @@ def verify_drive_api_enabled(client) -> None:
165165
Please enable it in the Google Cloud Console."
166166
)
167167
else:
168-
raise SourceConnectionError("Google drive API unreachable for an unknown reason!")
168+
raise SourceConnectionError(
169+
"Google drive API unreachable for an unknown reason!"
170+
)
169171

170172
@staticmethod
171-
def count_files_recursively(files_client, folder_id: str, extensions: list[str] = None) -> int:
173+
def count_files_recursively(
174+
files_client, folder_id: str, extensions: list[str] = None
175+
) -> int:
172176
"""
173177
Count non-folder files recursively under the given folder.
174178
If `extensions` is provided, only count files
@@ -247,7 +251,9 @@ def precheck(self) -> None:
247251
# that the service account has proper permissions."
248252
# )
249253
else:
250-
logger.info(f"Found {file_count} files recursively in the folder.")
254+
logger.info(
255+
f"Found {file_count} files recursively in the folder."
256+
)
251257
else:
252258
# Non-recursive: check for at least one immediate non-folder child.
253259
response = client.list(
@@ -275,7 +281,8 @@ def precheck(self) -> None:
275281

276282
except Exception as e:
277283
logger.error(
278-
"Failed to validate Google Drive connection during precheck", exc_info=True
284+
"Failed to validate Google Drive connection during precheck",
285+
exc_info=True,
279286
)
280287
raise SourceConnectionError(f"Precheck failed: {e}")
281288

@@ -295,7 +302,9 @@ def map_file_data(f: dict) -> FileData:
295302
date_modified_str = f.pop("modifiedTime", None)
296303
parent_path = f.pop("parent_path", None)
297304
parent_root_path = f.pop("parent_root_path", None)
298-
date_modified_dt = parser.parse(date_modified_str) if date_modified_str else None
305+
date_modified_dt = (
306+
parser.parse(date_modified_str) if date_modified_str else None
307+
)
299308
if (
300309
parent_path
301310
and isinstance(parent_path, str)
@@ -380,7 +389,9 @@ def get_paginated_results(
380389
return files_response
381390

382391
def get_root_info(self, files_client, object_id: str) -> dict:
383-
return files_client.get(fileId=object_id, fields=",".join(self.fields)).execute()
392+
return files_client.get(
393+
fileId=object_id, fields=",".join(self.fields)
394+
).execute()
384395

385396
def get_files(
386397
self,
@@ -391,7 +402,9 @@ def get_files(
391402
) -> list[FileData]:
392403
root_info = self.get_root_info(files_client=files_client, object_id=object_id)
393404
if not self.is_dir(root_info):
394-
root_info["permissions"] = self.extract_permissions(root_info.get("permissions"))
405+
root_info["permissions"] = self.extract_permissions(
406+
root_info.get("permissions")
407+
)
395408
data = [self.map_file_data(root_info)]
396409
else:
397410
file_contents = self.get_paginated_results(
@@ -476,13 +489,19 @@ def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
476489
_, downloaded = downloader.next_chunk()
477490
return downloaded
478491

479-
def _write_file(self, file_data: FileData, file_contents: io.BytesIO) -> DownloadResponse:
492+
def _write_file(
493+
self, file_data: FileData, file_contents: io.BytesIO
494+
) -> DownloadResponse:
480495
download_path = self.get_download_path(file_data=file_data)
481496
download_path.parent.mkdir(parents=True, exist_ok=True)
482-
logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
497+
logger.debug(
498+
f"writing {file_data.source_identifiers.fullpath} to {download_path}"
499+
)
483500
with open(download_path, "wb") as handler:
484501
handler.write(file_contents.getbuffer())
485-
return self.generate_download_response(file_data=file_data, download_path=download_path)
502+
return self.generate_download_response(
503+
file_data=file_data, download_path=download_path
504+
)
486505

487506
@requires_dependencies(["googleapiclient"], extras="google-drive")
488507
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:

0 commit comments

Comments
 (0)