|
| 1 | +import os |
| 2 | +import json |
| 3 | +from google.oauth2 import service_account |
| 4 | +from googleapiclient.discovery import build |
| 5 | +from hawk_scanner.internals import system |
| 6 | + |
def connect_google_drive(credentials_file, impersonate_user=None):
    """Build a read-only Google Drive v3 client from a service-account key.

    Args:
        credentials_file: Path to the service-account JSON key file.
        impersonate_user: Optional subject passed to
            ``Credentials.with_subject``; when falsy the service account's
            own identity is used.

    Returns:
        The Drive service object created by ``build('drive', 'v3', ...)``,
        or ``None`` when the key file cannot be read or parsed, or the
        client cannot be built. The failure is printed, not raised.
    """
    try:
        # Read the key file exactly once, inside a context manager so the
        # handle is always closed, and reuse the parsed dict below instead
        # of having the credentials class re-read the file.
        with open(credentials_file, 'r') as key_file:
            credentials_info = json.load(key_file)

        credentials = service_account.Credentials.from_service_account_info(
            credentials_info,
            scopes=['https://www.googleapis.com/auth/drive.readonly'],
        )

        if impersonate_user:
            credentials = credentials.with_subject(impersonate_user)

        return build('drive', 'v3', credentials=credentials)
    except Exception as e:
        # Top-level boundary: callers test the return value for truthiness,
        # so report the problem and signal failure with None.
        print(f"Failed to connect to Google Drive: {e}")
        return None
| 24 | + |
def download_file(drive, file_obj, base_path):
    """Download one Drive item below *base_path*, recursing into folders.

    Args:
        drive: Drive service object (as returned by ``connect_google_drive``).
        file_obj: File resource dict; ``'id'`` and ``'name'`` are required,
            ``'mimeType'`` and ``'parents'`` are optional.
        base_path: Local directory under which the item is stored.

    Returns:
        None. Any exception is caught and printed so that one failing item
        does not abort the surrounding scan.
    """
    try:
        file_name = file_obj['name']
        file_id = file_obj['id']

        # When the resource carries parent ids, mirror their names as local
        # sub-directories; the root folder ('My Drive') is skipped.
        folder_path = base_path
        for parent_id in file_obj.get('parents', []):
            parent_folder = drive.files().get(fileId=parent_id).execute()
            if parent_folder['name'] == 'My Drive':
                continue
            folder_path = os.path.join(folder_path, parent_folder['name'])

        file_path = os.path.join(folder_path, file_name)

        if file_obj.get('mimeType') == 'application/vnd.google-apps.folder':
            os.makedirs(file_path, exist_ok=True)
            # Children are requested without 'parents' so that their local
            # location is determined solely by the directory passed down
            # (the folder just created), keeping the tree nested correctly.
            request = drive.files().list(
                q=f"'{file_id}' in parents",
                fields="nextPageToken, files(id, name, mimeType)",
            )
            # Follow nextPageToken so large folders are not truncated to
            # the first result page.
            while request is not None:
                response = request.execute()
                for folder_file in response.get('files', []):
                    download_file(drive, folder_file, file_path)
                request = drive.files().list_next(request, response)
        else:
            # The parent-derived directory may not exist yet.
            if folder_path:
                os.makedirs(folder_path, exist_ok=True)
            content = drive.files().get_media(fileId=file_id).execute()
            with open(file_path, 'wb') as fh:
                fh.write(content)

        system.print_debug(f"File downloaded to: {file_path}")
    except Exception as e:
        # Best-effort: report and continue with the next item.
        print(f"Failed to download file: {e}")
| 54 | + |
def list_files(drive, impersonate_user=None):
    """Return every file resource located directly in the Drive root.

    Args:
        drive: Drive service object (as returned by ``connect_google_drive``).
        impersonate_user: When given, the query is additionally restricted
            to items that list this user in ``owners``.

    Returns:
        A list of file resource dicts collected across all result pages,
        or an empty list if any request fails (the error is printed).
    """
    try:
        query = "'root' in parents"
        if impersonate_user:
            query += f" and '{impersonate_user}' in owners"

        file_list = []
        request = drive.files().list(q=query)
        # files.list is paginated; keep requesting until list_next reports
        # that there is no further page.
        while request is not None:
            response = request.execute()
            file_list.extend(response.get('files', []))
            request = drive.files().list_next(request, response)
        return file_list
    except Exception as e:
        print(f"Error listing files: {e}")
        return []
| 65 | + |
def _gdrive_local_path(drive, file_obj, base_path):
    """Return the local path for *file_obj*: base_path / parent names / name."""
    folder_path = base_path
    for parent_id in file_obj.get('parents', []):
        parent_folder = drive.files().get(fileId=parent_id).execute()
        if parent_folder['name'] == 'My Drive':
            continue
        folder_path = os.path.join(folder_path, parent_folder['name'])
    return os.path.join(folder_path, file_obj['name'])


def execute(args):
    """Scan the configured Google Drive workspace profiles for pattern matches.

    Reads ``sources -> gdrive_workspace`` from ``system.get_connection()``.
    Each profile may define ``credentials_file``, ``impersonate_users``,
    ``exclude_patterns`` and ``cache``. Root-level files are downloaded to
    ``data/google_drive`` and passed to ``system.read_match_strings``.

    Args:
        args: Unused by this function; kept for interface compatibility.

    Returns:
        A list of dicts, one per match, with the keys ``file_id``,
        ``file_name``, ``user``, ``file_path``, ``pattern_name``,
        ``matches``, ``sample_text``, ``profile`` and ``data_source``.
    """
    import shutil  # local import: only needed for the final cleanup

    download_dir = "data/google_drive"
    folder_mime = 'application/vnd.google-apps.folder'

    results = []
    connections = system.get_connection()
    # Initialised up front so a missing 'sources' section cannot leave the
    # name unbound when it is tested below.
    drive_config = None
    # True once any profile asks for caching; decides the final cleanup.
    keep_download_dir = False

    if 'sources' in connections:
        drive_config = connections['sources'].get('gdrive_workspace')
    else:
        system.print_error("No 'sources' section found in connection.yml")

    if drive_config:
        for key, config in drive_config.items():
            credentials_file = config.get('credentials_file')
            impersonate_users = config.get('impersonate_users', [])
            # `config` already is the profile, so the patterns live directly
            # on it; the nested lookup is kept only as a fallback for
            # configurations written against the previous behaviour.
            exclude_patterns = (
                config.get('exclude_patterns')
                or config.get(key, {}).get('exclude_patterns', [])
            )
            cache_enabled = bool(config.get('cache', False))
            keep_download_dir = keep_download_dir or cache_enabled

            for impersonate_user in impersonate_users or [None]:
                drive = connect_google_drive(credentials_file, impersonate_user)
                os.makedirs(download_dir, exist_ok=True)
                if not drive:
                    system.print_error("Failed to connect to Google Drive")
                    continue

                for file_obj in list_files(drive, impersonate_user):
                    file_id = file_obj['id']
                    file_name = file_obj['name']

                    if file_obj.get('mimeType') == folder_mime:
                        # Folders are mirrored locally (recursively) but
                        # their contents are not scanned here.
                        download_file(drive, file_obj, download_dir)
                        continue

                    # Skip excluded files before spending a download on them.
                    if system.should_exclude_file(file_name, exclude_patterns):
                        continue

                    file_path = _gdrive_local_path(drive, file_obj, download_dir)

                    # Download exactly once, and only when no cached copy
                    # can be reused.
                    if cache_enabled and os.path.exists(file_path):
                        system.print_debug("File already exists in cache, using it.")
                    else:
                        download_file(drive, file_obj, download_dir)

                    matches = system.read_match_strings(file_path, 'gdrive')
                    for match in matches or []:
                        results.append({
                            'file_id': file_id,
                            'file_name': file_name,
                            'user': impersonate_user,
                            'file_path': file_path,
                            'pattern_name': match['pattern_name'],
                            'matches': match['matches'],
                            'sample_text': match['sample_text'],
                            'profile': key,
                            'data_source': 'gdrive_workspace',
                        })
    else:
        system.print_error("No Google Drive connection details found in connection file")

    if not keep_download_dir:
        # Portable replacement for shelling out to `rm -rf`.
        shutil.rmtree(download_dir, ignore_errors=True)

    return results
| 143 | + |
| 144 | +# Call the execute function with the necessary arguments, e.g.: |
| 145 | +# execute(args) |
0 commit comments