1
- import os # Import os for path joining and potential directory checks
1
+ import os
2
2
from collections .abc import Iterable
3
3
from contextlib import suppress
4
+ from enum import Enum
4
5
from pathlib import Path
5
6
from typing import Any , ClassVar
6
7
20
21
21
22
# Scopes requested for plain (non-impersonated) service-account access.
_SCOPES = ["https://www.googleapis.com/auth/drive"]

# Scopes requested when impersonating a user. These are the scopes the service account is
# delegated for in the Google Workspace Admin Console.
_IMPERSONATION_SCOPES = [
    "https://www.googleapis.com/auth/cloud-platform",  # General Cloud access (if needed)
    "https://www.googleapis.com/auth/drive",  # Google Drive API
]

# HTTP status codes
_HTTP_NOT_FOUND = 404
_HTTP_FORBIDDEN = 403


class GoogleDriveExportFormat(str, Enum):
    """
    Supported export MIME types for Google Drive downloads.

    Members are also plain strings (``str`` mixin), so ``member.value`` is the MIME type string.
    """

    DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    PDF = "application/pdf"
    PNG = "image/png"
    HTML = "text/html"
    TXT = "text/plain"
    JSON = "application/json"
    # Apps Script projects are exported with their own JSON MIME type rather than the generic
    # "application/json"; kept as a separate member so the export request uses the right value.
    APPS_SCRIPT_JSON = "application/vnd.google-apps.script+json"


# Maps Google-native Drive MIME types → export MIME types
_GOOGLE_EXPORT_MIME_MAP: dict[str, GoogleDriveExportFormat] = {
    "application/vnd.google-apps.document": GoogleDriveExportFormat.DOCX,
    "application/vnd.google-apps.spreadsheet": GoogleDriveExportFormat.XLSX,
    # NOTE(review): presentations previously defaulted to PPTX; confirm that PDF is the intended default.
    "application/vnd.google-apps.presentation": GoogleDriveExportFormat.PDF,
    "application/vnd.google-apps.drawing": GoogleDriveExportFormat.PNG,
    "application/vnd.google-apps.script": GoogleDriveExportFormat.APPS_SCRIPT_JSON,
    "application/vnd.google-apps.site": GoogleDriveExportFormat.HTML,
    "application/vnd.google-apps.map": GoogleDriveExportFormat.JSON,
    "application/vnd.google-apps.form": GoogleDriveExportFormat.PDF,
}

# Maps export MIME types → file extensions
_EXPORT_EXTENSION_MAP: dict[GoogleDriveExportFormat, str] = {
    GoogleDriveExportFormat.DOCX: ".docx",
    GoogleDriveExportFormat.XLSX: ".xlsx",
    GoogleDriveExportFormat.PPTX: ".pptx",
    GoogleDriveExportFormat.PNG: ".png",
    GoogleDriveExportFormat.PDF: ".pdf",
    GoogleDriveExportFormat.HTML: ".html",
    GoogleDriveExportFormat.TXT: ".txt",
    GoogleDriveExportFormat.JSON: ".json",
    GoogleDriveExportFormat.APPS_SCRIPT_JSON: ".json",
}
50
71
51
72
52
73
class GoogleDriveSource (Source ):
53
74
"""
54
75
Handles source connection for Google Drive and provides methods to fetch files.
76
+
77
+ NOTE: Do not define attributes at class level if they are passed to the Google client; define them at
78
+ instance level instead, or else the Google client will complain.
55
79
"""
56
80
57
81
file_id : str
@@ -62,12 +86,31 @@ class GoogleDriveSource(Source):
62
86
63
87
_google_drive_client : ClassVar ["GoogleAPIResource | None" ] = None
64
88
_credentials_file_path : ClassVar [str | None ] = None
89
+ impersonate : ClassVar [bool | None ] = None
90
+ impersonate_target_email : ClassVar [str | None ] = None
65
91
66
92
@classmethod
def set_credentials_file_path(cls, path: str) -> None:
    """
    Remember the location of the service account credentials file.

    The path is stored on the class itself, so it is shared by every instance.

    Args:
        path: Path to the service account credentials file.
    """
    cls._credentials_file_path = path
70
96
97
@classmethod
def set_impersonation_target(cls, target_mail: str) -> None:
    """
    Sets the email address to impersonate when accessing Google Drive resources.

    The address is stored on the class (together with the ``impersonate`` flag), so it applies to
    every instance. Surrounding whitespace is stripped before the value is stored.

    Args:
        target_mail (str): The email address to impersonate.

    Raises:
        ValueError: If the provided value is not a string, is empty, or does not have a non-empty
            part on both sides of an '@'.
    """
    # Non-string input is treated like an empty address so the caller always gets ValueError.
    candidate = target_mail.strip() if isinstance(target_mail, str) else ""

    # This is a sanity check, not full address validation: it only requires "<something>@<something>".
    local_part, separator, domain = candidate.partition("@")
    if not (local_part and separator and domain):
        raise ValueError("Invalid email address provided for impersonation.")

    # NOTE(review): a client that was already built is not reset here — confirm whether callers
    # are expected to set the target before the client is first initialized.
    cls.impersonate = True
    cls.impersonate_target_email = candidate
113
+
71
114
@classmethod
72
115
def _initialize_client_from_creds (cls ) -> None :
73
116
"""
@@ -82,7 +125,20 @@ def _initialize_client_from_creds(cls) -> None:
82
125
HttpError: If the Google Drive API is not enabled or accessible.
83
126
Exception: If any other error occurs during client initialization.
84
127
"""
85
- creds = service_account .Credentials .from_service_account_file (cls ._credentials_file_path , scopes = _SCOPES )
128
+ cred_kwargs = {
129
+ "filename" : cls ._credentials_file_path ,
130
+ "scopes" : _SCOPES ,
131
+ }
132
+
133
+ # handle impersonation
134
+ if cls .impersonate is not None and cls .impersonate :
135
+ if not cls .impersonate_target_email :
136
+ raise ValueError ("Impersonation target email must be set when impersonation is enabled." )
137
+ cred_kwargs ["subject" ] = cls .impersonate_target_email
138
+ cred_kwargs ["scopes" ] = _IMPERSONATION_SCOPES
139
+
140
+ creds = service_account .Credentials .from_service_account_file (** cred_kwargs )
141
+
86
142
cls ._google_drive_client = build ("drive" , "v3" , credentials = creds )
87
143
cls ._google_drive_client .files ().list (
88
144
pageSize = 1 , fields = "files(id)" , supportsAllDrives = True , includeItemsFromAllDrives = True
@@ -162,7 +218,11 @@ def verify_drive_api_enabled(cls) -> None:
162
218
163
219
@traceable
164
220
@requires_dependencies (["googleapiclient" ], "google_drive" )
165
- async def fetch (self ) -> Path :
221
+ async def fetch (
222
+ self ,
223
+ * ,
224
+ export_format : "GoogleDriveExportFormat | None" = None ,
225
+ ) -> Path :
166
226
"""
167
227
Fetch the file from Google Drive and store it locally.
168
228
@@ -171,6 +231,9 @@ async def fetch(self) -> Path:
171
231
The local directory is determined by the environment variable `LOCAL_STORAGE_DIR`. If this environment
172
232
variable is not set, a temporary directory is used.
173
233
234
+ Args:
235
+ export_format: Optional override for the export MIME type when downloading Google-native documents.
236
+
174
237
Returns:
175
238
The local path to the downloaded file.
176
239
@@ -186,7 +249,8 @@ async def fetch(self) -> Path:
186
249
file_local_dir = local_dir / self .file_id
187
250
file_local_dir .mkdir (parents = True , exist_ok = True )
188
251
189
- export_mime_type , file_extension = self ._determine_file_extension ()
252
+ override_mime = export_format .value if export_format else None
253
+ export_mime_type , file_extension = self ._determine_file_extension (override_mime = override_mime )
190
254
local_file_name = f"{ self .file_name } { file_extension } "
191
255
path = file_local_dir / local_file_name
192
256
@@ -496,22 +560,36 @@ async def from_uri(cls, path: str) -> Iterable[Self]:
496
560
else :
497
561
raise ValueError (f"Unsupported Google Drive URI pattern: { path } " )
498
562
499
def _determine_file_extension(self, override_mime: str | None = None) -> tuple[str, str]:
    """
    Work out which MIME type to download/export as and which extension to save the file under.

    Args:
        override_mime: When given, it is returned unchanged as the export MIME type and only the
            extension is derived here.

    Returns:
        A tuple of (export_mime_type, file_extension)
    """

    def known_extension(mime: str) -> str | None:
        # Extension for a MIME type that is a GoogleDriveExportFormat value; None for any other type.
        try:
            as_format = GoogleDriveExportFormat(mime)
        except ValueError:
            return None
        return _EXPORT_EXTENSION_MAP.get(as_format, ".bin")

    # 1. Explicit override: honour the requested MIME type as-is.
    if override_mime is not None:
        extension = known_extension(override_mime)
        if extension is None:
            # Unknown export type: reuse the file's own suffix when its name has a dot.
            extension = Path(self.file_name).suffix if "." in self.file_name else ".bin"
        return override_mime, extension

    # 2. Google-native document: pick the default export format (PDF when the type is unmapped).
    if self.mime_type.startswith("application/vnd.google-apps"):
        default_format = _GOOGLE_EXPORT_MIME_MAP.get(self.mime_type, GoogleDriveExportFormat.PDF)
        return default_format.value, _EXPORT_EXTENSION_MAP.get(default_format, ".bin")

    # 3. Regular file whose name contains a dot: keep its own suffix.
    if "." in self.file_name:
        return self.mime_type, Path(self.file_name).suffix

    # 4. Regular file without a dot in its name: derive the extension from its MIME type.
    extension = known_extension(self.mime_type)
    return self.mime_type, ".bin" if extension is None else extension
0 commit comments