Skip to content

Commit 694368c

Browse files
committed
Added support for scanning Google Drive and Google Workspace Drive
1 parent 40abbd0 commit 694368c

File tree

6 files changed

+211
-5
lines changed

6 files changed

+211
-5
lines changed

connection.yml.sample

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,19 @@ sources:
9090
gdrive:
9191
drive_example:
9292
folder_name:
93-
credentials_file: /Users/kumarohit/Downloads/client_secret.json
93+
credentials_file: /Users/kumarohit/Downloads/client_secret.json ## this must be the OAuth app JSON file
94+
cache: true
95+
exclude_patterns:
96+
- .pdf
97+
- .docx
98+
99+
gdrive_workspace:
100+
drive_example:
101+
folder_name:
102+
credentials_file: /Users/kumarohit/Downloads/client_secret.json ## this must be the service account JSON file
103+
impersonate_users:
104+
105+
94106
cache: true
95107
exclude_patterns:
96108
- .pdf

hawk_scanner/commands/gdrive.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
def connect_google_drive(credentials_file):
99
credentials = open(credentials_file, 'r').read()
1010
credentials = json.loads(credentials)
11+
## if installed key is in the credentials file, use it
12+
if 'installed' in credentials:
13+
credentials = credentials['installed']
1114
client_id = credentials['client_id']
1215
client_secret = credentials['client_secret']
1316

@@ -82,6 +85,8 @@ def execute(args):
8285
exclude_patterns = config.get(key, {}).get('exclude_patterns', [])
8386
is_cache_enabled = config.get('cache', False)
8487
drive = connect_google_drive(credentials_file)
88+
if not os.path.exists("data/google_drive"):
89+
os.makedirs("data/google_drive")
8590
if drive:
8691
files = list_files(drive, folder_name=folder_name)
8792
for file_obj in files:
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import os
2+
import json
3+
from google.oauth2 import service_account
4+
from googleapiclient.discovery import build
5+
from hawk_scanner.internals import system
6+
7+
def connect_google_drive(credentials_file, impersonate_user=None):
    """Build a read-only Google Drive v3 client from a service-account key.

    Args:
        credentials_file: Path to the service-account JSON key file.
        impersonate_user: Optional subject passed to
            ``Credentials.with_subject`` so requests are made on behalf of
            that user. ``None`` (or any falsy value) uses the service
            account's own identity.

    Returns:
        The Drive service object, or ``None`` when ``build`` fails (the
        error is printed).

    Raises:
        Any exception raised while loading the key file or applying the
        subject; only failures of ``build`` are caught here.
    """
    # from_service_account_file reads and parses the key file itself, so the
    # previous extra open()/json.loads() pass (which never closed the file
    # and whose result was unused) has been dropped.
    credentials = service_account.Credentials.from_service_account_file(
        credentials_file,
        scopes=['https://www.googleapis.com/auth/drive.readonly'],
    )

    if impersonate_user:
        credentials = credentials.with_subject(impersonate_user)

    try:
        return build('drive', 'v3', credentials=credentials)
    except Exception as e:
        print(f"Failed to connect to Google Drive: {e}")
        return None
24+
25+
def download_file(drive, file_obj, base_path):
    """Download a Drive file, or a folder with its contents, under base_path.

    Args:
        drive: Drive service object exposing ``files().get``,
            ``files().list`` and ``files().get_media``.
        file_obj: File metadata dict with at least ``name`` and ``id``;
            ``mimeType`` and ``parents`` are used when present.
        base_path: Local directory that acts as the download root.

    Returns:
        None. Failures are printed and swallowed (best effort), so one bad
        file does not abort the surrounding scan.
    """
    try:
        # Mirror the names of any listed parents (except the 'My Drive'
        # root) as local sub-directories below base_path.
        folder_path = base_path
        for parent_id in file_obj.get('parents', []):
            parent_folder = drive.files().get(fileId=parent_id).execute()
            if parent_folder['name'] == 'My Drive':
                continue
            folder_path = os.path.join(folder_path, parent_folder['name'])
    except Exception as e:
        print(f"Failed to download file: {e}")
        return

    _download_into(drive, file_obj, folder_path)


def _list_folder_children(drive, folder_id):
    """Return every child entry of a Drive folder, following all result pages."""
    children = []
    page_token = None
    while True:
        params = {'q': f"'{folder_id}' in parents"}
        if page_token:
            params['pageToken'] = page_token
        response = drive.files().list(**params).execute()
        children.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            return children


def _download_into(drive, file_obj, folder_path):
    """Write file_obj (a file, or a folder tree) directly inside folder_path."""
    try:
        file_name = file_obj['name']
        file_id = file_obj['id']
        file_path = os.path.join(folder_path, file_name)

        if file_obj.get('mimeType') == 'application/vnd.google-apps.folder':
            os.makedirs(file_path, exist_ok=True)
            for child in _list_folder_children(drive, file_id):
                # Children go inside the folder that was just created.
                # Recursing with the parent directory (as before) flattened
                # the tree and let nested files overwrite siblings.
                _download_into(drive, child, file_path)
        else:
            content = drive.files().get_media(fileId=file_id).execute()
            if folder_path:
                # The parent-derived directory may not exist yet.
                os.makedirs(folder_path, exist_ok=True)
            with open(file_path, 'wb') as fh:
                fh.write(content)

        system.print_debug(f"File downloaded to: {file_path}")
    except Exception as e:
        print(f"Failed to download file: {e}")
54+
55+
def list_files(drive, impersonate_user=None):
    """List the entries directly under the Drive root.

    Args:
        drive: Drive service object exposing ``files().list``.
        impersonate_user: Optional owner e-mail; when given, results are
            restricted to entries owned by that user.

    Returns:
        A list of file metadata dicts gathered from every result page, or an
        empty list if any request fails (the error is printed).
    """
    try:
        query = "'root' in parents"
        if impersonate_user:
            # Escape backslashes and single quotes so the address cannot
            # break out of the quoted literal in the Drive query.
            owner = impersonate_user.replace("\\", "\\\\").replace("'", "\\'")
            query += f" and '{owner}' in owners"

        file_list = []
        page_token = None
        while True:
            params = {'q': query}
            if page_token:
                params['pageToken'] = page_token
            response = drive.files().list(**params).execute()
            file_list.extend(response.get('files', []))
            # A single call returns one page only; keep going until the API
            # stops handing back a continuation token.
            page_token = response.get('nextPageToken')
            if not page_token:
                return file_list
    except Exception as e:
        print(f"Error listing files: {e}")
        return []
65+
66+
def execute(args):
    """Scan the Google Workspace drives configured under ``gdrive_workspace``.

    For every profile in the connection file, and for every user listed in
    its ``impersonate_users`` (or the service account itself when that list
    is empty or missing), the top-level Drive entries are downloaded into
    ``data/google_drive`` and passed to ``system.read_match_strings``.

    Args:
        args: Command arguments; not used by this function.

    Returns:
        A list with one dict per pattern match, carrying the file id, file
        name, user, local path, pattern name, matches, sample text, profile
        name and the ``gdrive_workspace`` data-source tag.
    """
    import shutil  # local import: only needed for the final cleanup

    download_dir = "data/google_drive"
    results = []
    # True as soon as any profile asks for caching; all profiles share the
    # same download directory, so it is only removed when none of them do.
    keep_cache = False
    connections = system.get_connection()

    if 'sources' not in connections:
        # Previously execution continued and crashed on an unbound
        # drive_config; report the problem and stop instead.
        system.print_error("No 'sources' section found in connection.yml")
        return results

    drive_config = connections['sources'].get('gdrive_workspace')

    if drive_config:
        for key, config in drive_config.items():
            credentials_file = config.get('credentials_file')
            # An empty ``impersonate_users:`` key yields None; fall back to a
            # single pass without impersonation.
            impersonate_users = config.get('impersonate_users') or [None]
            # exclude_patterns sits directly on the profile. The old lookup
            # via config.get(key, {}) always produced an empty list.
            exclude_patterns = config.get('exclude_patterns') or []
            use_cache = bool(config.get('cache', False))
            keep_cache = keep_cache or use_cache

            for impersonate_user in impersonate_users:
                drive = connect_google_drive(credentials_file, impersonate_user)
                if not drive:
                    system.print_error("Failed to connect to Google Drive")
                    continue

                os.makedirs(download_dir, exist_ok=True)
                results.extend(
                    _scan_drive_root(
                        drive,
                        impersonate_user,
                        key,
                        exclude_patterns,
                        use_cache,
                        download_dir,
                    )
                )
    else:
        system.print_error("No Google Drive connection details found in connection file")

    if not keep_cache:
        shutil.rmtree(download_dir, ignore_errors=True)

    return results


def _local_path_for(drive, file_obj, download_dir):
    """Return the local path download_file uses for a top-level file entry."""
    folder_path = download_dir
    for parent_id in file_obj.get('parents', []):
        parent_folder = drive.files().get(fileId=parent_id).execute()
        if parent_folder['name'] == 'My Drive':
            continue
        folder_path = os.path.join(folder_path, parent_folder['name'])
    return os.path.join(folder_path, file_obj['name'])


def _scan_drive_root(drive, impersonate_user, profile, exclude_patterns, use_cache, download_dir):
    """Download and scan the root-level entries of one drive; return findings."""
    findings = []
    for file_obj in list_files(drive, impersonate_user):
        file_id = file_obj['id']
        file_name = file_obj['name']

        if file_obj.get('mimeType') == 'application/vnd.google-apps.folder':
            # NOTE(review): folder contents are downloaded but, as before,
            # not scanned; only root-level files reach read_match_strings.
            download_file(drive, file_obj, download_dir)
            continue

        # Check exclusions before touching the network so excluded files are
        # neither downloaded nor looked up.
        if system.should_exclude_file(file_name, exclude_patterns):
            continue

        file_path = _local_path_for(drive, file_obj, download_dir)

        if use_cache and os.path.exists(file_path):
            system.print_debug("File already exists in cache, using it.")
        else:
            # Download once, and only on a cache miss.
            download_file(drive, file_obj, download_dir)

        if not os.path.exists(file_path):
            # download_file is best effort and only prints on failure, so
            # there is nothing to scan when the file never arrived.
            continue

        matches = system.read_match_strings(file_path, 'gdrive')
        for match in matches or []:
            findings.append({
                'file_id': file_id,
                'file_name': file_name,
                'user': impersonate_user,
                'file_path': file_path,
                'pattern_name': match['pattern_name'],
                'matches': match['matches'],
                'sample_text': match['sample_text'],
                'profile': profile,
                'data_source': 'gdrive_workspace'
            })
    return findings
143+
144+
# Call the execute function with the necessary arguments
145+
# execute(y

hawk_scanner/main.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def clear_screen():
2121
console = Console()
2222

2323
## Now separate the results by data_source
24-
data_sources = ['s3', 'mysql', 'redis', 'firebase', 'gcs', 'fs', 'postgresql', 'mongodb', 'slack', 'couchdb', 'gdrive']
24+
data_sources = ['s3', 'mysql', 'redis', 'firebase', 'gcs', 'fs', 'postgresql', 'mongodb', 'slack', 'couchdb', 'gdrive', 'gdrive_workspace']
2525

2626
def load_command_module(command):
2727
try:
@@ -101,6 +101,9 @@ def main():
101101
table.add_column("Host > Database > Document ID > Field")
102102
elif group == 'gdrive':
103103
table.add_column("File Name")
104+
elif group == 'gdrive_workspace':
105+
table.add_column("File Name")
106+
table.add_column("User")
104107

105108
table.add_column("Pattern Name")
106109
table.add_column("Total Exposed")
@@ -418,6 +421,35 @@ def main():
418421
exposed_values=records_mini
419422
)
420423

424+
system.SlackNotify(AlertMsg)
425+
elif group == 'gdrive_workspace':
426+
table.add_row(
427+
str(i),
428+
result['profile'],
429+
f"{result['file_name']}",
430+
result['user'],
431+
result['pattern_name'],
432+
str(len(result['matches'])),
433+
records_mini,
434+
result['sample_text'],
435+
)
436+
AlertMsg = """
437+
*** PII Or Secret Found ***
438+
Data Source: Google Drive Workspace - {vulnerable_profile}
439+
File Name: {file_name}
440+
User: {user}
441+
Pattern Name: {pattern_name}
442+
Total Exposed: {total_exposed}
443+
Exposed Values: {exposed_values}
444+
""".format(
445+
vulnerable_profile=result['profile'],
446+
file_name=result['file_name'],
447+
user=result['user'],
448+
pattern_name=result['pattern_name'],
449+
total_exposed=str(len(result['matches'])),
450+
exposed_values=records_mini
451+
)
452+
421453
system.SlackNotify(AlertMsg)
422454
else:
423455
# Handle other cases or do nothing for unsupported groups

readme.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,10 +274,22 @@ sources:
274274
- venv
275275
- node_modules
276276

277-
gdrive:
277+
gdrive:
278278
drive_example:
279279
folder_name:
280-
credentials_file: /Users/kumarohit/Downloads/client_secret.json
280+
credentials_file: /Users/kumarohit/Downloads/client_secret.json ## this must be the OAuth app JSON file
281+
cache: true
282+
exclude_patterns:
283+
- .pdf
284+
- .docx
285+
286+
gdrive_workspace:
287+
drive_example:
288+
folder_name:
289+
credentials_file: /Users/kumarohit/Downloads/client_secret.json ## this must be the service account JSON file
290+
impersonate_users:
291+
292+
281293
cache: true
282294
exclude_patterns:
283295
- .pdf

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = "0.3.5"
1+
VERSION = "0.3.6"
22

33
from setuptools import setup, find_packages
44

0 commit comments

Comments
 (0)