From 722b35d76d2427b599d7d580b4db353668040e6c Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Tue, 13 Dec 2022 14:57:09 -0500 Subject: [PATCH 001/111] workspace mapping scripts --- convert_all_logs.py | 107 +++++ map.py | 31 ++ requirements.txt | 38 ++ utils/create_asset_mapping_spreadsheet.py | 75 +++ utils/create_workspace.py | 112 +++++ utils/split_logs.py | 553 ++++++++++++++++++++++ utils/to_csv.py | 229 +++++++++ 7 files changed, 1145 insertions(+) create mode 100644 convert_all_logs.py create mode 100644 map.py create mode 100644 requirements.txt create mode 100644 utils/create_asset_mapping_spreadsheet.py create mode 100644 utils/create_workspace.py create mode 100644 utils/split_logs.py create mode 100644 utils/to_csv.py diff --git a/convert_all_logs.py b/convert_all_logs.py new file mode 100644 index 00000000..a8f1e942 --- /dev/null +++ b/convert_all_logs.py @@ -0,0 +1,107 @@ +###################### importing other scripts ############################################## +from utils import to_csv as util +from utils import create_asset_mapping_spreadsheet as create_spreadsheet +############################################################################################ +import argparse +import os + +def main(checkpoint): + # where you want the csv files to be located + # make the csv directory if it not there + if "csv" not in os.listdir(): + os.mkdir("./csv") + + # users + try: + users_data = util.read_log("users.log", checkpoint) + users_df = util.create_users(users_data) + util.save_to_csv(users_df, "users.csv") + except: + print("Error while trying to read users. Skipping...") + + # instance profiles + try: + ip_data = util.read_log("instance_profiles.log", checkpoint) + ip_df = util.create_instance_profiles(ip_data) + util.save_to_csv(ip_df, "instance_profiles.csv") + except: + print("Error while trying to read instance profiles. Skipping...") + + try: + ipo_data = util.read_log("instance_pools.log", checkpoint) + ipo_df = util.create_instance_pools(ipo_data) + util.save_to_csv(ipo_df, "instance_pools.csv") + except: + print("Error while trying to read instance pools. Skipping...") + + # groups + try: + groups_df = util.create_groups(checkpoint, directory_name = "groups") + util.save_to_csv(groups_df, "groups.csv") + except: + print("Error while trying to read users. Skipping...") + + + # clusters + try: + clusters_data = util.read_log("clusters.log", checkpoint) + clusters_df = util.create_clusters(clusters_data) + util.save_to_csv(clusters_df, "clusters.csv") + except: + print("Error while trying to read clusters. Skipping...") + + # job + try: + jobs_data = util.read_log('jobs.log', checkpoint) + jobs_df = util.create_jobs(jobs_data) + util.save_to_csv(jobs_df, "jobs.csv") + except: + print("Error while trying to read jobs. Skipping...") + + # shared + try: + shared_df = util.create_shared_logs(checkpoint, directory_name = "artifacts/Shared") + util.save_to_csv(shared_df, 'shared_logs.csv') + except: + print("Error while trying to read shared directory. Skipping...") + + # other artificats + try: + other_df = util.create_other_artifacts(checkpoint, directory_name = "artifacts") + util.save_to_csv(other_df, "top_level_artifacts.csv") + except: + print("Error while trying to read global artifacts. Skipping...") + + # libraries + try: + libraries_data = util.read_log("libraries.log", checkpoint) + libraries_df = util.create_libraries(libraries_data) + util.save_to_csv(libraries_df, "libraries.csv") + except: + print("Error while trying to read libraries. 
Skipping...") + + # secret scopes + try: + scopes_df = util.create_scopes(checkpoint, directory_name = 'secret_scopes') + util.save_to_csv(scopes_df, "secret_scopes.csv") + except: + prnit("Error while trying to read secrets. Skipping...") + + # metastore + try: + metastore_df = util.create_metastore(checkpoint, directory_name = 'metastore') + util.save_to_csv(metastore_df, "metastore.csv") + except: + print('Error while trying to read metastore. Skipping..') + + create_spreadsheet.csv_to_excel("./csv") + print("Sucessfully created spreadsheet asset_mapping.xlsx. ") + +if __name__ == "__main__": + + + all_args = argparse.ArgumentParser() + all_args.add_argument("--checkpoint", dest="checkpoint", default="", help="set if you are using a checkpoint during export") + + args = all_args.parse_args() + main(args.checkpoint) diff --git a/map.py b/map.py new file mode 100644 index 00000000..5b17e69d --- /dev/null +++ b/map.py @@ -0,0 +1,31 @@ +import argparse +from utils.create_workspace import Workspace as Workspace + +def main(): + # checkpoints are optional for export but you will need to use them for the import + # (each workspace is a 'checkpoint') + + # takes two arguments: checkpoint and workspaces + all_args = argparse.ArgumentParser() + all_args.add_argument("--checkpoint", dest="checkpoint", default="", help="set if you are using a checkpoint during export") + all_args.add_argument("--workspaces", dest="workspaces", nargs="+", required=True, help="list of workspace names. must match csv columns") + + args = all_args.parse_args() + + checkpoint = args.checkpoint + workspaces = args.workspaces + + + # for each workspace + for w in workspaces: + # create a workspace Class - refer to create_workspace.py + # this instantiates the original location of the session and the new location of the session + # it also instantiates another class Split - refer to split_logs.py + # Split instantiates the same thing as well as two variables: imported users and imported groups (this is used for remaking ACLs) + workspace = Workspace(checkpoint, w, workspaces) + success = workspace.run() + + workspace.copy_other_files() + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..b8883efe --- /dev/null +++ b/requirements.txt @@ -0,0 +1,38 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: osx-64 +blas=1.0=mkl +bottleneck=1.3.4=py310h4e76f89_0 +bzip2=1.0.8=h1de35cc_0 +ca-certificates=2022.4.26=hecd8cb5_0 +certifi=2022.6.15=py310hecd8cb5_0 +et_xmlfile=1.1.0=py310hecd8cb5_0 +intel-openmp=2021.4.0=hecd8cb5_3538 +libcxx=12.0.0=h2f01273_0 +libffi=3.3=hb1e8313_2 +mkl=2021.4.0=hecd8cb5_637 +mkl-service=2.4.0=py310hca72f7f_0 +mkl_fft=1.3.1=py310hf879493_0 +mkl_random=1.2.2=py310hc081a56_0 +ncurses=6.3=hca72f7f_2 +numexpr=2.8.1=py310hdcd3fac_2 +numpy=1.22.3=py310hdcd3fac_0 +numpy-base=1.22.3=py310hfd2de13_0 +openpyxl=3.0.9=pyhd3eb1b0_0 +openssl=1.1.1o=hca72f7f_0 +packaging=21.3=pyhd3eb1b0_0 +pandas=1.4.2=py310he9d5cce_0 +pip=21.2.4=py310hecd8cb5_0 +pyparsing=3.0.4=pyhd3eb1b0_0 +python=3.10.4=hdfd78df_0 +python-dateutil=2.8.2=pyhd3eb1b0_0 +pytz=2022.1=py310hecd8cb5_0 +readline=8.1.2=hca72f7f_1 +setuptools=61.2.0=py310hecd8cb5_0 +six=1.16.0=pyhd3eb1b0_1 +sqlite=3.38.5=h707629a_0 +tk=8.6.12=h5d9f67b_0 +tzdata=2022a=hda174b7_0 +wheel=0.37.1=pyhd3eb1b0_0 +xz=5.2.5=hca72f7f_1 +zlib=1.2.12=h4dc903c_2 diff --git a/utils/create_asset_mapping_spreadsheet.py b/utils/create_asset_mapping_spreadsheet.py new file 
mode 100644 index 00000000..9686b52c --- /dev/null +++ b/utils/create_asset_mapping_spreadsheet.py @@ -0,0 +1,75 @@ +''' +This script creates an asset mapping Excel spreadsheet using the csvs created by convert_all_logs.py +Each csv gets its own sheet and adds a tag column for the customer to tag assets +The resulting .xlsx file can be imported into a Google Sheet +''' + +import os, csv, openpyxl + + +def csv_to_excel(input_folder): + + # Instantiate a new Excel workbook + workbook = openpyxl.Workbook() + + # Loop through csv's and read each one line-by-line using csv.reader + for csv_file in os.listdir(input_folder): + csv_data = [] + + full_path = input_folder + '/' + csv_file + + with open(full_path) as file_obj: + reader = csv.reader(file_obj) + for row in reader: + csv_data.append(row) + + # Use the name of the csv file as the sheet name + sheet_name = os.path.splitext(csv_file)[0] + + # Create new sheet + workbook.create_sheet(title=sheet_name) + sheet = workbook[sheet_name] + + # Insert csv data into sheet + for row in csv_data: + sheet.append(row) + + # Insert a tag column before column at index 2 + sheet.insert_cols(2) + sheet['B1'] = 'tag' + + # Freeze the first row and first two columns + sheet.freeze_panes = sheet['C2'] + + # Resizing columns to fit cell contents, has a max value in case columns are very wide + for col in sheet.columns: + max_length = 0 + column = col[0].column_letter # Get the column name + for cell in col: + try: # Necessary to avoid error on empty cells + if len(str(cell.value)) > 0: + max_length = len(str(cell.value)) + except: + pass + + # Copied this formula from an example, seems to do a good job + adjusted_width = round((max_length + 2) * 1.2, 0) + + # Keeps column widths from getting too large, 75 is arbitrary + if adjusted_width > 75: + adjusted_width = 75 + + sheet.column_dimensions[column].width = adjusted_width + + # Remove the default sheet + default_sheet = workbook['Sheet'] + workbook.remove(default_sheet) + + # Save the Excel file + workbook.save("asset_mapping.xlsx") + +def main(): + csv_to_excel(os.getcwd() + '/csv') + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/utils/create_workspace.py b/utils/create_workspace.py new file mode 100644 index 00000000..6c185862 --- /dev/null +++ b/utils/create_workspace.py @@ -0,0 +1,112 @@ +from utils.split_logs import Split +import os +import json +import shutil +import pandas as pd +import datetime + +class Workspace(): + def __init__(self, checkpoint, workspace, all_workspaces): + self.path = "./logs/"+checkpoint+"/" + self.workspace = str(workspace) + self.new_path = "./logs/"+checkpoint+"_"+workspace+"/" + self.workspaces = all_workspaces + self.checkpoint = checkpoint + split = Split(checkpoint, workspace) + + # this is where all assets are mapped to what csv they refer to + what function they use for the split + self.map = { + 'users': ["users", split.users], + 'instance_pools' : ["instance_pools", split.instance_pools], + 'instance_profiles': ["instance_profiles", split.instance_profiles], + 'groups': ["groups", split.groups], + 'jobs': ["jobs", split.jobs], + 'acl_jobs': ["jobs", split.acl_jobs], + 'secret_scopes': ["secret_scopes", split.secret_scopes], + 'secret_scopes_acls':["secret_scopes", split.secret_scopes_acls], + 'clusters': ["clusters", split.clusters], + 'cluster_policies': ["clusters", split.cluster_policy], + 'acl_clusters':["clusters", split.acl_clusters], + 'acl_cluster_policies': ["clusters", split.acl_cluster_policies], + 'mounts': ["mounts", 
split.mounts], + 'shared_notebooks': ["global_shared_logs", split.shared_notebooks], + 'global_notebooks': ["global_logs", split.global_notebooks], + 'user_notebooks': ["users", split.user_notebooks], + 'user_dirs': ["users", split.user_dirs], + 'user_workspace': ["users", split.user_workspace], + 'acl_notebooks':["users", split.acl_notebooks], + 'acl_directories':["users", split.acl_directories], + 'metastore': ["metastore", split.metastore], + 'success_metastore': ["metastore", split.success_metastore], + 'table_acls':["metastore", split.table_acls] + } + print("-"*80) + print(f"Starting with workspace {workspace}...") + self.create_workspace(workspace, checkpoint) + + @staticmethod + def create_workspace(wk="test", checkpoint=""): + """ + summary: creates a directory for each workspace + """ + directories = os.listdir("./logs/") + name = checkpoint+"_"+wk + if name not in directories: + os.mkdir("./logs/"+name) + #print("Workspace directory {} was successfully created.".format(name)) + + def copy_other_files(self): + """ + summary: copy files that need to be copied to all workspace folders + """ + total = ['app_logs', 'checkpoint', 'database_details.log', 'source_info.txt'] + for w in self.workspaces: + # don't copy the logs that were not in the csvs directory + total_in_workspace = os.listdir("./logs/"+self.checkpoint+"_"+w) + for file in total: + if file not in self.workspaces: + try: + # if it is a file, copy just that file. otherwise, copy all files recursively in it + if os.path.isfile("./logs/"+self.checkpoint+"/"+file): + #print(f"Copying file {file} to workspace {w}") + shutil.copy("./logs/"+self.checkpoint+"/"+file, "./logs/"+self.checkpoint+"_"+w+"/"+file) + else: + #print(f"Copying directory {file} to workspace {w}") + shutil.copytree("./logs/"+self.checkpoint+"/"+file, "./logs/"+self.checkpoint+"_"+w+"/"+file) + except Exception as e: + pass + + def run(self): + """ + summary: run each module for every asset + """ + # for each + for m in self.map.keys(): + print(f"{datetime.now()} Starting with {m}...") + try: + # get the asset function that splits that asset + module_function = self.map[m][1] + # get the appropriate csv that matches it + sheet = self.map[m][0] + # split_csv performs the actual split and outputs all csvs that were not in the csv directory + success = self.split_csv(m, module_function, sheet) + except Exception as e: + pass + return 0 + + def split_csv(self, module, module_function, sheet_name): + # reads csv and inputs attribute columns where the workspace column is set to Y + # you can set that variable to True or 1 or anything else that the client is using + # but it will ignore anything else + df = pd.read_excel("asset_mapping.xlsx", sheet_name = sheet_name) + #df = pd.read_csv("./csv/"+csv, index_col=0) + current_df = df[df[self.workspace] == "Y"] + # send that subset dataframe to the module function found in Split class + errors = module_function(current_df.reset_index()) + + #pushing all errors to a csv + if 'errors' not in self.new_path: + os.mkdir(self.new_path + 'errors') + pd.DataFrame(errors).to_csv(self.new_path + 'errors/' + sheet_name) + # success should be 0 + return 0 diff --git a/utils/split_logs.py b/utils/split_logs.py new file mode 100644 index 00000000..04f586b8 --- /dev/null +++ b/utils/split_logs.py @@ -0,0 +1,553 @@ +import json +import os +import shutil +import pandas as pd +import gzip + +class Split(): + def __init__(self, checkpoint, workspace): + self.path = "./logs/"+checkpoint+"/" + self.workspace = workspace + self.new_path = 
"./logs/"+checkpoint+"_"+workspace+"/" + self.imported_users = [] + self.imported_groups = ['admins', 'Users'] + + def read_log(self, file_name): + """ + summary: reads a given log + """ + try: + with open(self.path+file_name) as f: + data = f.read().split("\n") + return data + except FileNotFoundError as e: + return print(f"File {file_name} not found. ") + except Exception as e: + print(f"There was an error while reading {file_name}. ") + #print(e) + return '' + + def write_logs(self, log, file_name): + """ + summary: function to write a dict to a 'json' log in the same way that + the original logs are written + """ + file_path = self.new_path+file_name + + with open(file_path, 'w') as f: + for l in log: + f.write(json.dumps(l) + '\n') + + def fix_acls(self, acls): + new_acls = [] + for permission in acls: + if 'group_name' in permission.keys(): + if permission['group_name'] in self.imported_groups: + new_acls.append(permission) + if 'user_name' in permission.keys(): + if permission['user_name'] in self.imported_users: + new_acls.append(permission) + if 'principal' in permission.keys(): + if permission['principal'] in self.imported_users: + new_acls.append(permission) + if 'display' in permission.keys(): + if permission['display'] in self.imported_users: + new_acls.append(permission) + + return new_acls + + def users(self, df, file_name="users.log"): + self.imported_users = [] + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if d['userName'] in df['userName'].tolist(): + data_write.append(d) + self.imported_users.append(d['userName']) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + + self.write_logs(data_write, file_name) + return errors + + + def instance_pools(self, df, file_name="instance_pools.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if d['instance_pool_id'] in df['instance_pool_id'].tolist(): + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + + def secret_scopes(self, df, file_name=None): + scopes = df["secret_scope_names"] + errors = {'Data':[], 'Error':[]} + for scope in scopes: + try: + if "secret_scopes" not in os.listdir(self.new_path): + os.mkdir(self.new_path+"secret_scopes") + new_file_path = self.new_path+"secret_scopes/"+scope + src_path = self.path+"secret_scopes/"+scope + shutil.copyfile(src_path,new_file_path) + except Exception as e: + errors['Data'].append(scope) + errors['Error'].append(e) + return errors + + def secret_scopes_acls(self, df, file_name="secret_scopes_acls.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if d['scope_name'] in df['secret_scope_names'].tolist(): + data_write.append(d) + if "items" in d.keys(): + d['items'] = self.fix_acls(d['items']) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def clusters(self, df, file_name = "clusters.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if d['cluster_name'] in 
df['cluster_name'].tolist(): + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def cluster_policy(self, df, file_name = "cluster_policies.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if d['policy_id'] in df['policy_id'].tolist(): + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def acl_clusters(self, df, file_name = "acl_clusters.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + cluster = d['object_id'].split("/")[-1] + if cluster in df['cluster_id'].tolist(): + data_write.append(d) + if "access_control_list" in d.keys(): + d['access_control_list'] = self.fix_acls(d['access_control_list']) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def acl_cluster_policies(self, df, file_name = "acl_cluster_policies.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + policy = d['object_id'].split("/")[-1] + if policy in df['policy_id'].tolist(): + data_write.append(d) + if "access_control_list" in d.keys(): + d['access_control_list'] = self.fix_acls(d['access_control_list']) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def jobs(self, df, file_name="jobs.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if d['job_id'] in df['job_ids'].tolist(): + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def acl_jobs(self, df, file_name="acl_jobs.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + jobid = d['object_id'].split("/")[-1] + if int(jobid) in df['job_ids'].tolist(): + data_write.append(d) + if "access_control_list" in d.keys(): + d['access_control_list'] = self.fix_acls(d['access_control_list']) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def instance_profiles(self, df, file_name="instance_profiles.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if d['instance_profile_arn'] in df['instance_profile_arn'].tolist(): + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def mounts(self, df, file_name='mounts.log'): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if d['path'] in df.loc[(df['mount_paths'].tolist(): + 
data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def groups(self, df, file_name=None): + groups = df['group_name'] + errors = {'Data':[], 'Error':[]} + + for group in groups: + try: + if "groups" not in os.listdir(self.new_path): + os.mkdir(self.new_path + "groups/") + new_file_path = self.new_path + "groups/" + src_path = self.path + "groups/" + group + + group_data = self.read_log("groups/" + group) + group_data_write = [] + for d in group_data: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if "members" in d.keys(): + d['members'] = self.fix_acls(d['members']) + group_data_write.append(d) + self.write_logs(group_data_write, "groups/" + group) + except Exception as e: + errors['Data'].append(group) + errors['Error'].append(e) + all_groups = os.listdir(self.path + "groups") + self.imported_groups = [g for g in all_groups if g in groups ] + return errors + + def user_dirs(self, df=None, file_name="user_dirs.log"): + data_user = df + user_names = data_user['userName'].tolist() + if "global_shared_logs" in os.listdir("./csv/"): + data_art = pd.read_csv('./csv/global_shared_logs.csv', index_col=0) + art_names = data_art['global_shared_folder_names'].tolist() + else: + data_art = [] + art_names = [] + if "shared_logs" in os.listdir("./csv/"): + data_shared = pd.read_csv('./csv/shared_logs.csv', index_col=0) + shared_names = data_shared['notebook_names'].tolist() + else: + data_shared = [] + shared_names = [] + data = self.read_log(file_name) + user_paths=['/Users/'+ n for n in user_names] + shared_paths=['/Shared/'+ n for n in shared_names] + data_write = [] + errors = {'Data':[], 'Error':[]} + + for d in data: + if d != '': + try: + d = json.loads(d) + path = str(d['path']) + if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def user_workspace(self, df, file_name="user_workspace.log"): + data_user = df + user_names = data_user['userName'].tolist() + + if "global_shared_logs" in os.listdir("./csv/"): + data_art = pd.read_csv('csv/global_shared_logs.csv', index_col=0) + art_names = data_art['global_shared_folder_names'].tolist() + else: + data_art = [] + art_names = [] + if "shared_logs" in os.listdir("./csv/"): + data_shared = pd.read_csv('csv/shared_logs.csv', index_col=0) + shared_names = data_shared['notebook_names'].tolist() + else: + data_shared = [] + shared_names = [] + data = self.read_log(file_name) + user_paths=['/Users/'+ n for n in user_names] + shared_paths=['/Shared/'+ n for n in shared_names] + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + if d != '': + try: + d = json.loads(d) + path = str(d['path']) + if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def shared_notebooks(self, df, file_name=None): + names = df['notebook_names'] + errors = {'Data':[], 'Error':[]} + for notebook in names: + try: + if "artifacts" not in os.listdir(self.new_path): + os.mkdir(self.new_path+'artifacts') + if "Shared" not in os.listdir(self.new_path+"artifacts/Shared/"): + 
os.mkdir(self.new_path+'artifacts/Shared/') + new_folder_path = self.new_path+'artifacts/Shared/'+notebook + src_path = self.path+'artifacts/Shared/'+notebook + shutil.copytree(src_path,new_folder_path) + except Exception as e: + errors['Data'].append(notebook) + errors['Error'].append(e) + return errors + + def global_notebooks(self, df, file_name=None): + names = df['global_shared_folder_names'] + errors = {'Data':[], 'Error':[]} + for notebook in names: + try: + if "artifacts" not in os.listdir(self.new_path): + os.mkdir(self.new_path+'artifacts') + new_folder_path = self.new_path+'artifacts/'+notebook + src_path = self.path+'artifacts/'+notebook + shutil.copytree(src_path,new_folder_path) + except Exception as e: + errors['Data'].append(notebook) + errors['Error'].append(e) + return errors + + def user_notebooks(self, df, file_name=None): + errors = {'Data':[], 'Error':[]} + for u in self.imported_users: + try: + if "artifacts" not in os.listdir(self.new_path): + os.mkdir(self.new_path+'artifacts') + if "Users" not in os.listdir(self.new_path + "artifacts/"): + os.mkdir(self.new_path+'artifacts/Users/') + + new_folder_path = self.new_path+'artifacts/Users/'+u + src_path = self.path+'artifacts/Users/'+u + shutil.copytree(src_path,new_folder_path) + except Exception as e: + errors['Data'].append(u) + errors['Error'].append(e) + return errors + + def acl_notebooks(self, df, file_name="acl_notebooks.log"): + data_user = df + user_names = data_user['userName'].tolist() + if "global_shared_logs" in os.listdir("./csv/"): + data_art = pd.read_csv('csv/global_shared_logs.csv', index_col=0) + art_names = data_art['global_shared_folder_names'].tolist() + else: + data_art = [] + art_names = [] + if "shared_logs" in os.listdir("./csv/"): + data_shared = pd.read_csv('csv/shared_logs.csv', index_col=0) + shared_names = data_shared['notebook_names'].tolist() + else: + data_shared = [] + shared_names = [] + data = self.read_log(file_name) + user_paths=['/Users/'+ n for n in user_names] + shared_paths=['/Shared/'+ n for n in shared_names] + data_write = [] + errors = {'Data':[], 'Error':[]} + for d in data: + if d != '': + try: + d = json.loads(d) + path = str(d['path']) + if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + data_write.append(d) + if "access_control_list" in d.keys(): + d['access_control_list'] = self.fix_acls(d['access_control_list']) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def acl_directories(self, df, file_name="acl_directories.log"): + data_user = df + user_names = data_user['userName'].tolist() + if "global_shared_logs" in os.listdir("./csv/"): + data_art = pd.read_csv('csv/global_shared_logs.csv', index_col=0) + art_names = data_art['global_shared_folder_names'].tolist() + else: + data_art = [] + art_names = [] + if "shared_logs" in os.listdir("./csv/"): + data_shared = pd.read_csv('csv/shared_logs.csv', index_col=0) + shared_names = data_shared['notebook_names'].tolist() + else: + data_shared = [] + shared_names = [] + data = self.read_log(file_name) + user_paths=['/Users/'+ n for n in user_names] + shared_paths=['/Shared/'+ n for n in shared_names] + data_write = [] + errors = {'Data':[], 'Error':[]} + + for d in data: + if d != '': + try: + d = json.loads(d) + path = str(d['path']) + if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + 
data_write.append(d) + if "access_control_list" in d.keys(): + d['access_control_list'] = self.fix_acls(d['access_control_list']) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return errors + + def metastore(self, df, file_name=None): + databases = os.listdir(self.path + "metastore/") + errors = {'Data':[], 'Error':[]} + + for db in df['metastore_database']: + try: + if "metastore" not in os.listdir(self.new_path): + os.mkdir(self.new_path+"metastore/") + new_folder_path = self.new_path+"metastore/"+db + src_path = self.path+"metastore/"+db + if db not in os.listdir(self.new_path+"metastore/"): + shutil.copytree(src_path, new_folder_path) + except Exception as e: + errors['Data'].append(db) + errors['Error'].append(e) + return errors + + def success_metastore(self, df, file_name='success_metastore.log'): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + database = d['table'].split(".")[0] + if len(df.loc[(df['metastore_database'] == database)]) > 0: + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return 0 + + def table_acls(self, df, file_name="logs/table_acls/00_table_acls.json.gz"): + errors = {'Data':[], 'Error':[]} + with gzip.open(file_name, 'rb') as f_in: + with open(self.path+"table_acls/00_table_acls.json", "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + data = self.read_log('table_acls/00_table_acls.json') + data_write = [] + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + if len(df.loc[(df['metastore_database'] == d['Database'])]) > 0: + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + if "table_acls" not in os.listdir(self.new_path): + os.mkdir(self.new_path+"table_acls") + file_path = self.new_path+"table_acls/00_table_acls.json" + with open(file_path, 'w') as f: + json.dump(data_write, f) + return errors diff --git a/utils/to_csv.py b/utils/to_csv.py new file mode 100644 index 00000000..5523ac61 --- /dev/null +++ b/utils/to_csv.py @@ -0,0 +1,229 @@ +import pandas as pd +import json +import argparse +import os + +def read_log(file_name, checkpoint): + try: + with open ("logs/" + checkpoint + "/" + file_name) as f: + data = f.read().split("\n") + return data[:-1] + + except FileNotFoundError as e: + return '' + except Exception as e: + print(f"Error while reading {file_name}...") + return '' + +def save_to_csv(data, file_name): + try: + pd.DataFrame.from_dict(data).to_csv("./csv/" + file_name) + except: + print(f"Error while writing {file_name}...") + + +def create_instance_profiles(data): + instance_profile_arn = [] + for d in data: + try: + d = json.loads(d) + instance_profile_arn.append(d['instance_profile_arn']) + except Exception as e: + pass + return {'instance_profile_arn': instance_profile_arn} + +def create_instance_pools(data): + instance_pool_name = [] + instance_pool_id = [] + + for d in data: + try: + d = json.loads(d) + instance_pool_name.append(d['instance_pool_name']) + instance_pool_id.append(d['instance_pool_id']) + except Exception as e: + pass + + return {'instance_pool_name': instance_pool_name, 'instance_pool_id': instance_pool_id} + +def create_users(data): + userName = [] + displayName = [] + + for d in data: + try: + d = json.loads(d) + if "userName" in d: + 
userName.append(d['userName']) + else: + userName.append(" ") + if "displayName" in d: + displayName.append(d['displayName']) + else: + displayName.append(" ") + + except Exception as e: + pass + + return {'userName': userName, 'displayName': displayName} + +def read_group(group_path): + try: + with open(group_path) as f: + data = f.read().split("\n") + return data + except FileNotFoundError as e: + return '' + except Exception as e: + print(f"Error while reading {group_path}...") + print(e) + return '' + +def create_groups(checkpoint = "", directory_name = "groups"): + groups_path = f"./logs/{checkpoint}/{directory_name}/" + groups_dir = os.listdir(groups_path) + groups = {} + + for g in groups_dir: + group_roles = [] + group_members = [] + group_users = [] + + data = read_group(groups_path + g) + data = data[0] + d = json.loads(data) + group_name = d['displayName'] + + try: + roles = d['roles'] + for role in roles: + group_roles.append(role['value']) + except: + pass + + try: + members = d['members'] + for member in members: + group_members.append(member['display']) + group_users.append(member['userName']) + except: + pass + + groups[group_name] = [group_roles, group_members, group_users] + results = {} + total_names = [] + total_group_roles = [] + total_group_members = [] + total_group_users = [] + + for k,v in groups.items(): + total_names.append(k) + total_group_roles.append(v[0]) + total_group_members.append(v[1]) + total_group_users.append(v[2]) + return {'group_name': total_names, 'group_roles': total_group_roles, 'group_members': total_group_members, 'group_users': total_group_users } + + +def create_clusters(data): + cluster_id = [] + cluster_name = [] + creator_user_name = [] + policy_id = [] + + for d in data: + try: + d = json.loads(d) + cluster_id.append(d['cluster_id']) + cluster_name.append(d['cluster_name']) + creator_user_name.append(d['creator_user_name']) + if "policy_id" in d.keys(): + policy_id.append(d['policy_id']) + else: + policy_id.append(" ") + except Exception as e: + print("Error in creating clusters...") + + return {'cluster_id': cluster_id, 'cluster_name': cluster_name, 'creator_user_name': creator_user_name, 'policy_id': policy_id} + +def create_jobs(data): + job_ids = [] + job_names = [] + job_types = [] + job_owners = [] + + for d in data: + try: + d = json.loads(d) + job_ids.append(d['job_id']) + jn = d['settings']['name'] + job_names.append(jn[:jn.index('::')]) + try: + job_types.append(d['settings']['format']) + except: + job_types.append('N/A') + try: + job_owners.append(d['creator_user_name']) + except: + job_owners.append('N/A') + except Exception as e: + print("Error in creating jobs...") + return {'job_ids': job_ids, 'job_names': job_names, 'job_type':job_types, 'job_creator':job_owners } + + +def create_shared_logs(checkpoint = "", directory_name = "artifacts/shared"): + shared_path = f"./logs/{checkpoint}/{directory_name}" + notebooks = os.listdir(shared_path) + + return {"notebook_names" : notebooks} + +def create_other_artifacts(checkpoint = "", directory_name = "artifacts"): + other_path = f"./logs/{checkpoint}/{directory_name}" + notebooks = os.listdir(other_path) + if "Users" in notebooks: + notebooks.remove("Users") + if "Shared" in notebooks: + notebooks.remove("Shared") + + return {"global_folder_names" : notebooks} + +def create_libraries(data): + library_paths = [] + library_names = [] + for d in data: + if len(d) > 0: + try: + d = json.loads(d) + library_paths.append(d['path']) + library_names.append(d['path'].split("/")[-1]) + except 
Exception as e: + print("Error in creating libraries...") + + return {'library_paths': library_paths, 'library_names': library_names} + +def create_scopes(checkpoint = "", directory_name = "secret_scopes"): + try: + secrets = os.listdir(f"./logs/{checkpoint}/{directory_name}/") + return {"secret_scopes" : secrets} + except: + print("Error while reading secrets directory...") + +def create_mounts(data): + mount_paths = [] + mount_sources = [] + + for d in data: + try: + d = json.loads(d) + mount_paths.append(d['path']) + mount_sources.append(d['source']) + except Exception as e: + print("Error in mounts...") + + return { 'mount_paths' : mount_paths, 'mount_sources' : mount_sources } + + +def create_metastore(checkpoint = "", directory_name = 'metastore'): + metastore_path = f"./logs/{checkpoint}/{directory_name}" + metastore_database = [i for i in os.listdir(metastore_path)] + + return {'metastore_database' : metastore_database} From 5b3df0efb1c4c528239ad6ac5abccd27da0127c8 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Mon, 19 Dec 2022 12:10:22 -0500 Subject: [PATCH 002/111] updating workspace mapping scripts --- notebook_filter.ipynb | 302 ++++++++++++++++++++++++++++++ utils/create_workspace.py | 9 +- utils/split_logs.py | 6 +- workspace_mapping_instructions.md | 116 ++++++++++++ 4 files changed, 425 insertions(+), 8 deletions(-) create mode 100644 notebook_filter.ipynb create mode 100644 workspace_mapping_instructions.md diff --git a/notebook_filter.ipynb b/notebook_filter.ipynb new file mode 100644 index 00000000..610c7739 --- /dev/null +++ b/notebook_filter.ipynb @@ -0,0 +1,302 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import shutil\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Prevent wide columns from being cut off\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Moving notebooks we want to keep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# /Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "source_folder = '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old'\n", + "dest_folder = '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Move artifacts folder to artifacts_old\n", + "# This way the filtered folder is called artifacts and the import tool will use the correct one\n", + "# source/dest reads backwards for this step but makes sense going forward \n", + "\n", + "if not Path(source_folder).is_dir() and Path(dest_folder).is_dir():\n", + " shutil.move(dest_folder, source_folder)\n", + "else:\n", + " raise Exception('Make sure to have notebooks in a folder called artifacts and delete the folder artifacts_old if it already exists')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Read in notebooks to keep and rename 'notebook' column to 'source' for disambiguation\n", + "df = pd.read_csv('keep_notebooks.csv', 
delimiter='\\t')\n", + "df.rename(columns={'notebook':'source'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a new dataframe where source and dest are Path objects instead of strings\n", + "# Append '.dbc' to each filename\n", + "\n", + "keep_df = pd.DataFrame()\n", + "keep_df['source'] = df['source'].apply(lambda x: Path(source_folder + x + '.dbc'))\n", + "keep_df['dest'] = df['source'].apply(lambda x: Path(dest_folder + x + '.dbc'))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/Users/d.mcbeath@elsevier.com/E2Migration/Migrate/Groups.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/Users/d.mcbeath@elsevier.com/TERMite/Concordancer.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/PlumX/Match_with_PlumX_Data.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/stats_paul.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/Preliminary report (30 Sept. 2022)/ARC_Indexing_30sept.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/stats_luigi_with_citations.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/stats_luigi.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/Shared/ResearchMetrics/Epics/No-Epic/Twitter CM Notebook.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/QA on TRO matched and unmatched.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/Preliminary report (30 Sept. 2022)/30sept_mainTables.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/Preliminary report (30 Sept. 2022)/ARC_ERA2018_matching.dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/Preliminary report (30 Sept. 
2022)/Q1B_Citation_Linking_Analysis_(Crhis_Rosin).dbc'\n", + "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/luigi_unmatched_TRO_exploration.dbc'\n", + "13 files failed to copy\n" + ] + } + ], + "source": [ + "# Loop through each row and copy the file in location 'source' to the location 'dest'\n", + "# Creates dest directories if they don't exist\n", + "\n", + "counter = 0\n", + "\n", + "for _, row in keep_df.iterrows():\n", + " row['dest'].parent.mkdir(parents=True, exist_ok=True)\n", + " try:\n", + " shutil.copy(row['source'], row['dest'])\n", + " except Exception as e:\n", + " counter += 1\n", + " print(e)\n", + "\n", + "print (f\"{counter} files failed to copy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13 notebooks were not copied\n" + ] + } + ], + "source": [ + "# Checking that all notebooks were copied by making a list of every file with a .dbc file extension in /artifacts/ and all subfolders\n", + "all_notebooks = Path(dest_folder).glob('**/*.dbc')\n", + "all_notebooks = [x for x in all_notebooks if x.is_file()]\n", + "\n", + "# Compare how many .dbc files are in the dest folder against the number of rows in keep_df\n", + "if len(all_notebooks) == keep_df.shape[0]:\n", + " print('All notebooks copied successfully')\n", + "else:\n", + " print(f'{keep_df.shape[0] - len(all_notebooks)} notebooks were not copied')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Archiving Notebooks" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "archive_folder = \"/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts/Archive\"" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Read in notebooks to archive and rename 'notebook' column to 'source' for disambiguation\n", + "df = pd.read_csv('archive_notebooks.csv', delimiter='\\t')\n", + "df.rename(columns={'notebook':'source'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a new dataframe where source and dest are Path objects instead of strings\n", + "# Append '.dbc' to each filename\n", + "\n", + "archive_df = pd.DataFrame()\n", + "archive_df['source'] = df['source'].apply(lambda x: Path(source_folder + x + '.dbc'))\n", + "archive_df['dest'] = df['source'].apply(lambda x: Path(archive_folder + x + '.dbc'))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 files failed to copy\n" + ] + } + ], + "source": [ + "# Loop through each row and copy the file in location 'source' to the location 'dest'\n", + "# Creates destination directories if they don't exist\n", + "\n", + "counter = 0\n", + "\n", + "for _, row in archive_df.iterrows():\n", + " row['dest'].parent.mkdir(parents=True, exist_ok=True)\n", + " try:\n", + " shutil.copy(row['source'], row['dest'])\n", + " except Exception as e:\n", + " counter += 1\n", + " print(e)\n", + "\n", + "print (f\"{counter} files failed to copy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All archived notebooks copied successfully\n" + ] + } + ], + "source": [ + "# Checking that all notebooks were copied by making a list of every file with a .dbc file extension in /artifacts/archive/ and all subfolders\n", + "all_archived_notebooks = Path(archive_folder).glob('**/*.dbc')\n", + "all_archived_notebooks = [x for x in all_archived_notebooks if x.is_file()]\n", + "\n", + "# Compare how many .dbc files are in the archive folder against the number of rows in archive_df\n", + "if len(all_archived_notebooks) == archive_df.shape[0]:\n", + " print('All archived notebooks copied successfully')\n", + "else:\n", + " print(f'{archive_df.shape[0] - len(all_archived_notebooks)} notebooks were not copied')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.12 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/utils/create_workspace.py b/utils/create_workspace.py index 6c185862..97c25e84 100644 --- a/utils/create_workspace.py +++ b/utils/create_workspace.py @@ -3,7 +3,7 @@ import json import shutil import pandas as pd -import datetime +from datetime import datetime class Workspace(): def __init__(self, checkpoint, workspace, all_workspaces): @@ -41,7 +41,7 @@ def __init__(self, checkpoint, workspace, all_workspaces): 'table_acls':["metastore", split.table_acls] } print("-"*80) - print(f"Starting with workspace {workspace}...") + print(f"CREATING WORKSPACE {workspace}...") self.create_workspace(workspace, checkpoint) @staticmethod @@ -82,7 +82,7 @@ def run(self): """ # for each for m in self.map.keys(): - print(f"{datetime.now()} Starting with {m}...") + print(f"{datetime.now()} Starting to split {m}.") try: # get the asset function that splits that asset module_function = self.map[m][1] @@ -103,10 +103,9 @@ def split_csv(self, module, module_function, sheet_name): current_df = df[df[self.workspace] == "Y"] # send that subset dataframe to the module function found in Split class errors = module_function(current_df.reset_index()) - #pushing all errors to a csv if 'errors' not in self.new_path: os.mkdir(self.new_path + 'errors') - pd.DataFrame(errors).to_csv(self.new_path + 'errors/' + sheet_name) + pd.DataFrame(errors).to_csv(self.new_path + 'errors/' + sheet_name + '.csv') # success should be 0 return 0 diff --git a/utils/split_logs.py b/utils/split_logs.py index 04f586b8..09c438b8 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -66,9 +66,9 @@ def users(self, df, file_name="users.log"): if len(d) != 0: d = d.strip() d = json.loads(d) - if d['userName'] in df['userName'].tolist(): + if d['emails'][0]['value'] in df['userName'].tolist(): data_write.append(d) - self.imported_users.append(d['userName']) + self.imported_users.append(d['emails'][0]['value']) except Exception as e: errors['Data'].append(d) errors['Error'].append(e) @@ -268,7 +268,7 @@ def mounts(self, df, file_name='mounts.log'): if len(d) != 0: d = 
d.strip() d = json.loads(d) - if d['path'] in df.loc[(df['mount_paths'].tolist(): + if d['path'] in df['mount_paths'].tolist(): data_write.append(d) except Exception as e: errors['Data'].append(d) diff --git a/workspace_mapping_instructions.md b/workspace_mapping_instructions.md new file mode 100644 index 00000000..8be96a2a --- /dev/null +++ b/workspace_mapping_instructions.md @@ -0,0 +1,116 @@ +### Virtual Environment +Use the requirements file to install all appropriate packages. For example, in Conda, you can install the packages while creating the environment like this: + +``` +conda create --name --file requirements.txt +```` + +# Workspace Mapping +## Logs to CSV + +Run the **convert_all_logs.py** file. This will result in a directory _/csv_ with all of the necessary csvs and excel document _asset_mapping.xslx_ that contains all of the csvs as spreadsheets. These csvs will used to manually allocate certain resources to each workspace. + +``` +python convert_all_logs.py +``` + +Please keep insert all of the files directly into the migrate folder. Do not put it in the logs directory or a specific checkpoint. The migrate should look like this: + +```bash +├── logs +│   ├── clusters.log +│   ├── groups +│   │   ├── ... +│   ├── instance_pools.log +│   ├── instance_profiles.log +│   ├── jobs.log +│   ├── libraries.log +│   ├── secret_scopes +│   │   ├── ... +│   ├── users.log +├── convert_all_logs.py +└── utils +``` + +After running the scripts, you should see a _csv_ directory with the csvs. + +```bash +├── csv +│   ├── users.csv +│   ├── global_shared_logs.csv +│   ├── instance_pools.csv +│   ├── libraries.csv +│   ├── jobs.csv +│   ├── secret_scopes.csv +│   ├── clusters.csv +│   ├── instance_profiles.csv +│   ├── mounts.csv +│   ├── metastore.csv +│   ├── groups.csv +│   ├── shared_logs.csv +``` +## Manual Resource Mapping + +Directly using the csvs, allocate where each resource will be moved. Add the workspace to each csv under a column titled **workspace**. + +## Mapping + +Run the map.py file. Please note the session name (this is the name of the directory below the logs directory that contains the logs) and enter it using the parameter _checkpoint_. List all of the workspaces with a space using the parameter _workspace_. This script will take in the csvs and split the logs to each workspace, located in a different directory. + +``` +python map.py --checkpoint [SESSION NAME] --workspace [WORKSPACE1 WORKSPACE2 ..] +``` + +This assumes that the folder /logs is located in the same directory as map.py. Please do not change headings in the csvs as these headings are referenced in the mapping. + +This is what the directory should look like: + +```bash +├── csv +│   ├── users.csv +│   ├── global_shared_logs.csv +│   ├── instance_pools.csv +│   ├── libraries.csv +│   ├── jobs.csv +│   ├── secret_scopes.csv +│   ├── clusters.csv +│   ├── instance_profiles.csv +│   ├── mounts.csv +│   ├── metastore.csv +│   ├── groups.csv +│   ├── shared_logs.csv +├── logs +│   ├── [session name] +│   │ ├── users.log +│   │ ├── clusters.log +│   │ ├── user_dirs.log +│   │ ├── ... +├── map.py +├── utils +│   ├── create_workspace.py +│   ├── split_logs.py + +``` + +After running the map.py file, your directory should look like this. Each workspace should have their own unique session name (whatever the session name was concatenated with the workspace name). This should allow you to import the logs directly using that unique session name. + +```bash +├── csv +│   ├── ... 
+├── map.py +├── utils +├── logs +│   ├── [session name]_workspace1 +│   │ ├── users.log +│   │ ├── clusters.log +│   │ ├── user_dirs.log +│   │ ├── ... +│   ├── [session name]_workspace2 +│   │ ├── users.log +│   │ ├── clusters.log +│   │ ├── user_dirs.log +│   │ ├── ... +│   ├── ... +└── ... + +``` From 8b06df934d5f30be5feb0f372c0fe87148102972 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Mon, 19 Dec 2022 12:12:02 -0500 Subject: [PATCH 003/111] delete unnecessary files --- notebook_filter.ipynb | 302 ------------------------------------------ 1 file changed, 302 deletions(-) delete mode 100644 notebook_filter.ipynb diff --git a/notebook_filter.ipynb b/notebook_filter.ipynb deleted file mode 100644 index 610c7739..00000000 --- a/notebook_filter.ipynb +++ /dev/null @@ -1,302 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import shutil\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Prevent wide columns from being cut off\n", - "pd.set_option(\"display.max_colwidth\", None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Moving notebooks we want to keep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# /Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "source_folder = '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old'\n", - "dest_folder = '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts'" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# Move artifacts folder to artifacts_old\n", - "# This way the filtered folder is called artifacts and the import tool will use the correct one\n", - "# source/dest reads backwards for this step but makes sense going forward \n", - "\n", - "if not Path(source_folder).is_dir() and Path(dest_folder).is_dir():\n", - " shutil.move(dest_folder, source_folder)\n", - "else:\n", - " raise Exception('Make sure to have notebooks in a folder called artifacts and delete the folder artifacts_old if it already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Read in notebooks to keep and rename 'notebook' column to 'source' for disambiguation\n", - "df = pd.read_csv('keep_notebooks.csv', delimiter='\\t')\n", - "df.rename(columns={'notebook':'source'}, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a new dataframe where source and dest are Path objects instead of strings\n", - "# Append '.dbc' to each filename\n", - "\n", - "keep_df = pd.DataFrame()\n", - "keep_df['source'] = df['source'].apply(lambda x: Path(source_folder + x + '.dbc'))\n", - "keep_df['dest'] = df['source'].apply(lambda x: Path(dest_folder + x + '.dbc'))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Errno 2] No such file or directory: 
'/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/Users/d.mcbeath@elsevier.com/E2Migration/Migrate/Groups.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/Users/d.mcbeath@elsevier.com/TERMite/Concordancer.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/PlumX/Match_with_PlumX_Data.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/stats_paul.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/Preliminary report (30 Sept. 2022)/ARC_Indexing_30sept.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/stats_luigi_with_citations.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/stats_luigi.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/Shared/ResearchMetrics/Epics/No-Epic/Twitter CM Notebook.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/QA on TRO matched and unmatched.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/Preliminary report (30 Sept. 2022)/30sept_mainTables.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/Preliminary report (30 Sept. 2022)/ARC_ERA2018_matching.dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/Preliminary report (30 Sept. 
2022)/Q1B_Citation_Linking_Analysis_(Crhis_Rosin).dbc'\n", - "[Errno 2] No such file or directory: '/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts_old/rads/projects/2022_AUS_ARC/TRO analysis/luigi_unmatched_TRO_exploration.dbc'\n", - "13 files failed to copy\n" - ] - } - ], - "source": [ - "# Loop through each row and copy the file in location 'source' to the location 'dest'\n", - "# Creates dest directories if they don't exist\n", - "\n", - "counter = 0\n", - "\n", - "for _, row in keep_df.iterrows():\n", - " row['dest'].parent.mkdir(parents=True, exist_ok=True)\n", - " try:\n", - " shutil.copy(row['source'], row['dest'])\n", - " except Exception as e:\n", - " counter += 1\n", - " print(e)\n", - "\n", - "print (f\"{counter} files failed to copy\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "13 notebooks were not copied\n" - ] - } - ], - "source": [ - "# Checking that all notebooks were copied by making a list of every file with a .dbc file extension in /artifacts/ and all subfolders\n", - "all_notebooks = Path(dest_folder).glob('**/*.dbc')\n", - "all_notebooks = [x for x in all_notebooks if x.is_file()]\n", - "\n", - "# Compare how many .dbc files are in the dest folder against the number of rows in keep_df\n", - "if len(all_notebooks) == keep_df.shape[0]:\n", - " print('All notebooks copied successfully')\n", - "else:\n", - " print(f'{keep_df.shape[0] - len(all_notebooks)} notebooks were not copied')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Archiving Notebooks" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "archive_folder = \"/Users/robbfournier/Desktop/E2_Migrations/Elsevier/logs/STexport2/artifacts/Archive\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# Read in notebooks to archive and rename 'notebook' column to 'source' for disambiguation\n", - "df = pd.read_csv('archive_notebooks.csv', delimiter='\\t')\n", - "df.rename(columns={'notebook':'source'}, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a new dataframe where source and dest are Path objects instead of strings\n", - "# Append '.dbc' to each filename\n", - "\n", - "archive_df = pd.DataFrame()\n", - "archive_df['source'] = df['source'].apply(lambda x: Path(source_folder + x + '.dbc'))\n", - "archive_df['dest'] = df['source'].apply(lambda x: Path(archive_folder + x + '.dbc'))" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 files failed to copy\n" - ] - } - ], - "source": [ - "# Loop through each row and copy the file in location 'source' to the location 'dest'\n", - "# Creates destination directories if they don't exist\n", - "\n", - "counter = 0\n", - "\n", - "for _, row in archive_df.iterrows():\n", - " row['dest'].parent.mkdir(parents=True, exist_ok=True)\n", - " try:\n", - " shutil.copy(row['source'], row['dest'])\n", - " except Exception as e:\n", - " counter += 1\n", - " print(e)\n", - "\n", - "print (f\"{counter} files failed to copy\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "All archived notebooks copied successfully\n" - ] - } - ], - "source": [ - "# Checking that all notebooks were copied by making a list of every file with a .dbc file extension in /artifacts/archive/ and all subfolders\n", - "all_archived_notebooks = Path(archive_folder).glob('**/*.dbc')\n", - "all_archived_notebooks = [x for x in all_archived_notebooks if x.is_file()]\n", - "\n", - "# Compare how many .dbc files are in the archive folder against the number of rows in archive_df\n", - "if len(all_archived_notebooks) == archive_df.shape[0]:\n", - " print('All archived notebooks copied successfully')\n", - "else:\n", - " print(f'{archive_df.shape[0] - len(all_archived_notebooks)} notebooks were not copied')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.9.12 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From d0cdf0a6e114dd31822fd1bddb27c1b6144445d5 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Mon, 19 Dec 2022 12:13:54 -0500 Subject: [PATCH 004/111] adding util notebooks --- Root Hive Migration.dbc | Bin 0 -> 6020 bytes Workspace Sizing Notebook.html | 43 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 Root Hive Migration.dbc create mode 100644 Workspace Sizing Notebook.html diff --git a/Root Hive Migration.dbc b/Root Hive Migration.dbc new file mode 100644 index 0000000000000000000000000000000000000000..e9323d607f1c36357f7cc0cda663c5d0b7d53ba4 GIT binary patch literal 6020 zcmai&Wl$VilZGd_GXxn3gG+FS;BLW!TX1J^8(@$Dg1c*Q*C4@yUo5x>cY*|Wht2)9 z-)_~`zFntIcb$5>`se9KRS^-10Dy*u2B3G-(*pd5Fabya2UAB|D@!+bHV3PhVE}+N zS^y`25(s#$=e3j8k${_Q((n^Wy`eRVIX_ijP_fz#%(|sX#p14_AM74PM`&VxP=G;a zVIn3br14eMv%_!IKYTov`)#uWNfU8YyjH{bN$erJA&ok zqxydiM*a6y-O0(FQr;G7NvUjW?P}_7>*PquN~vvW>1fL4?Coyjwa$P{Oc5Rf(OYJ8A%C{l+xbf>+p9Nd&V1!U4O)7{ zV^@E*r&Rf@+eRa?e`j=D3*D9LJ9}Q9fK=;UB3ipb=5U0vq1Ka*b`Bjv{Hl+ThTTn) zz%@`OMgHwLnFK}l#{imuK!U3bAa1WS$R`OoXEOFO%m(8ui+3h>tkycDvG-8Ub_sv2 z9im;Wm&F3M`9>m3aofnI_5-n3`>N(+vIOXsz&~Uaz8gXEy9>h-I&U%yxjukl)4OZcQf+Px% ziCthI9)(y>%MDS^cqTbMr2P^;qoG*Z8q6iXT89sMdJ!Y}b0CBh)g^?`*l)x~v!Z?y z7K;Wq z$uX8(XXaTG*fDx&J+q1?jYzp^?@Vy9QuW=&6!rqoaeW8RO;Qc3d1|`(CAr*|L4V4N zFZ0_kx%yqwO2I=$89ANs8N~bVskbJ;q~4U=D*Ilh!6HdJo?OvE7R$s#2!m073YX5d zww|c_v8L8+n$T!0fWZeGgIUp#7rs+7DBb9kfU^vV7P?g5*E%E^j`}RKe1rsWHAtm z#B7`ICc%oMWoz+iCak*|PF53~JctTn#PDlA)G0fi$o1JQI6+mubtH_HZ z_NeB?b%))KuJ2+Yo3mYUbOc3bcnov(&nBUTKP)+xcvFD@*m@8BJ{JmAuftKi8*<>$h5_JbY`C?pz$Rh7 zJ`fc*OrT6j?#^h(3$q?}6MRWLdNfl8<}fzb2GSC#2;7#ZEQVCAlq5rV>fL}=6p_0H zSZUHr*c7(9h1dEbXlEtkcQ*A*Q5k*lI3l`nsoJoyq{_syg^`h8vGoQjC5d~Ar-3&K$s2@vxMfck}@DAMBd zFLvW0IQNI*tLM5qhHr8bhsX=jeJ_K<2?8|{Rz-)@+`7stWAwpsF^W|N=8CxvNHp|H zQ+UArNQZQbr7|ilr(uWC^pMy`U%{uOgqGl`5JjKho3hfD+d^kr87n!c;Hzgs;Khjo zIvL+u$|tjH>r>N2l(4~nzFZs+OXGZti_bL9MoLEf5>VZbMB_ZVT-#sB zj>g!Xf{Lvj3-IyPr6#kbRkw8%O9xb%aP(L6{J}RJueeWQqvPKxD4Lh6octM7%&Ebs zB#^F;?cD#PEWJvf0B*$925zjTvLm7fC2;X=avI|iKU+wu>$2`QY>&VTqb}F|rQn^f 
zIh;xc7o@GsC%3@Sg(9t^YtTX9f(wtCb3HkMkK?e=E+oS3E7CVQlZEnhIpD`ir;lpc zmVvNn(vBSVsdv{TFM=g(=w*iDI;l4(|4_+eoxbmfx@g#i?p-Shu1+tlFn&yhZvNpU z{4=nWlR>6CFO&waCC4@@XdUgCy#}3L^tbt*v%x7(>z%H#5Iw^smjqh^43%}F5tA5} z-q_;KiXON(_Ux`%vY)eY@}W48c_yy~3aGlfi6;mCY9ds62(78~i^8fpZXa>1*=Nt+ zLwX{XNp>|RtsTVRP745&*3sM}{q62$L^oT+44WD zsS2Nidc_n7yYAb1lUav;k%VoPyE}r6|l1z^eUf=x7k-xq} z&b=c90D^u0<;ZUk2mt?$xc`$e|BJZ)r~Vi4{@>>RZ2T{V{%>XcP5*x=t*VHO@~;Qs NZ^Zn)H;MkO{tNQiL>2%5 literal 0 HcmV?d00001 diff --git a/Workspace Sizing Notebook.html b/Workspace Sizing Notebook.html new file mode 100644 index 00000000..889a5971 --- /dev/null +++ b/Workspace Sizing Notebook.html @@ -0,0 +1,43 @@ + + + + +Workspace Sizing Notebook - Databricks + + + + + + + + + + + + + + + + + + From a72f2b01c0d9ee964f1f6a5403a3c2829f1b7837 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Wed, 21 Dec 2022 13:18:59 -0500 Subject: [PATCH 005/111] consistency edits --- convert_all_logs.py | 4 ++-- utils/create_workspace.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/convert_all_logs.py b/convert_all_logs.py index a8f1e942..30504082 100644 --- a/convert_all_logs.py +++ b/convert_all_logs.py @@ -61,14 +61,14 @@ def main(checkpoint): # shared try: shared_df = util.create_shared_logs(checkpoint, directory_name = "artifacts/Shared") - util.save_to_csv(shared_df, 'shared_logs.csv') + util.save_to_csv(shared_df, 'global_shared_logs.csv') except: print("Error while trying to read shared directory. Skipping...") # other artificats try: other_df = util.create_other_artifacts(checkpoint, directory_name = "artifacts") - util.save_to_csv(other_df, "top_level_artifacts.csv") + util.save_to_csv(other_df, "global_logs.csv") except: print("Error while trying to read global artifacts. Skipping...") diff --git a/utils/create_workspace.py b/utils/create_workspace.py index 97c25e84..e22ac904 100644 --- a/utils/create_workspace.py +++ b/utils/create_workspace.py @@ -106,6 +106,10 @@ def split_csv(self, module, module_function, sheet_name): #pushing all errors to a csv if 'errors' not in self.new_path: os.mkdir(self.new_path + 'errors') - pd.DataFrame(errors).to_csv(self.new_path + 'errors/' + sheet_name + '.csv') + + if len(errors[0]) > 0: + print(f"{datetime.now()} There are errors. Please review error logs for {module}") + + pd.DataFrame(errors).to_csv(self.new_path + 'errors/' + sheet_name + '.csv') # success should be 0 return 0 From 23cf07f2a491c9c939d31a351894729769a7c63f Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Mon, 23 Jan 2023 09:57:43 -0500 Subject: [PATCH 006/111] adds notification if Job Owner is dropped. --- utils/split_logs.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index 09c438b8..aea1740b 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -38,7 +38,7 @@ def write_logs(self, log, file_name): for l in log: f.write(json.dumps(l) + '\n') - def fix_acls(self, acls): + def fix_acls(self, acls, jobs=False): new_acls = [] for permission in acls: if 'group_name' in permission.keys(): @@ -47,6 +47,11 @@ def fix_acls(self, acls): if 'user_name' in permission.keys(): if permission['user_name'] in self.imported_users: new_acls.append(permission) + else: + # user will get dropped + if jobs: + if permission['permission_level'] == 'IS_OWNER': + print(f"Dropping Job Owner {permission['user_name']} from job. 
Add Job Owner to acl_jobs.log") if 'principal' in permission.keys(): if permission['principal'] in self.imported_users: new_acls.append(permission) @@ -54,6 +59,8 @@ def fix_acls(self, acls): if permission['display'] in self.imported_users: new_acls.append(permission) + + return new_acls def users(self, df, file_name="users.log"): @@ -232,6 +239,7 @@ def acl_jobs(self, df, file_name="acl_jobs.log"): jobid = d['object_id'].split("/")[-1] if int(jobid) in df['job_ids'].tolist(): data_write.append(d) + print(f"Editing Job with Job ID: {jobid}") if "access_control_list" in d.keys(): d['access_control_list'] = self.fix_acls(d['access_control_list']) except Exception as e: From 8a89b143599b095cad2b0384cd658f66daf9797b Mon Sep 17 00:00:00 2001 From: Allistair Cota Date: Thu, 9 Mar 2023 16:38:07 -0500 Subject: [PATCH 007/111] fixes for secret scopes, shared folders, and global logs --- utils/split_logs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index aea1740b..6fe4820f 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -103,7 +103,7 @@ def instance_pools(self, df, file_name="instance_pools.log"): def secret_scopes(self, df, file_name=None): - scopes = df["secret_scope_names"] + scopes = df["secret_scopes"] errors = {'Data':[], 'Error':[]} for scope in scopes: try: @@ -387,7 +387,7 @@ def shared_notebooks(self, df, file_name=None): try: if "artifacts" not in os.listdir(self.new_path): os.mkdir(self.new_path+'artifacts') - if "Shared" not in os.listdir(self.new_path+"artifacts/Shared/"): + if "Shared" not in os.listdir(self.new_path+"artifacts/"): os.mkdir(self.new_path+'artifacts/Shared/') new_folder_path = self.new_path+'artifacts/Shared/'+notebook src_path = self.path+'artifacts/Shared/'+notebook @@ -398,7 +398,7 @@ def shared_notebooks(self, df, file_name=None): return errors def global_notebooks(self, df, file_name=None): - names = df['global_shared_folder_names'] + names = df['global_folder_names'] errors = {'Data':[], 'Error':[]} for notebook in names: try: From 9e88271e3edec61b062a6ec7cf183d9ecebeee56 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Fri, 10 Mar 2023 15:20:34 -0500 Subject: [PATCH 008/111] adding functionality to also split database_details.log --- utils/create_workspace.py | 7 ++++--- utils/split_logs.py | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/utils/create_workspace.py b/utils/create_workspace.py index e22ac904..d5d96701 100644 --- a/utils/create_workspace.py +++ b/utils/create_workspace.py @@ -38,7 +38,8 @@ def __init__(self, checkpoint, workspace, all_workspaces): 'acl_directories':["users", split.acl_directories], 'metastore': ["metastore", split.metastore], 'success_metastore': ["metastore", split.success_metastore], - 'table_acls':["metastore", split.table_acls] + 'table_acls':["metastore", split.table_acls], + "database_details": ["metastore", split.database_details] } print("-"*80) print(f"CREATING WORKSPACE {workspace}...") @@ -59,7 +60,7 @@ def copy_other_files(self): """ summary: copy files that need to be copied to all workspace folders """ - total = ['app_logs', 'checkpoint', 'database_details.log', 'source_info.txt'] + total = ['app_logs', 'checkpoint', 'source_info.txt'] for w in self.workspaces: # don't copy the logs that were not in the csvs directory total_in_workspace = os.listdir("./logs/"+self.checkpoint+"_"+w) @@ -109,7 +110,7 @@ def split_csv(self, module, module_function, sheet_name): if len(errors[0]) > 0: 
print(f"{datetime.now()} There are errors. Please review error logs for {module}") - + pd.DataFrame(errors).to_csv(self.new_path + 'errors/' + sheet_name + '.csv') # success should be 0 return 0 diff --git a/utils/split_logs.py b/utils/split_logs.py index aea1740b..3c3923a4 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -528,7 +528,26 @@ def success_metastore(self, df, file_name='success_metastore.log'): d = d.strip() d = json.loads(d) database = d['table'].split(".")[0] - if len(df.loc[(df['metastore_database'] == database)]) > 0: + if database in df['metastore_database'].tolist(): + data_write.append(d) + except Exception as e: + errors['Data'].append(d) + errors['Error'].append(e) + self.write_logs(data_write, file_name) + return 0 + + def database_details(self, df, file_name="database_details.log"): + data = self.read_log(file_name) + data_write = [] + errors = {'Data':[], 'Error':[]} + + for d in data: + try: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + database = d['Namespace Name'] + if database in df['metastore_database'].tolist(): data_write.append(d) except Exception as e: errors['Data'].append(d) From 05f7c950985f1e0a7d74f3071c7617f175d8a3d1 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Fri, 17 Mar 2023 10:25:05 -0400 Subject: [PATCH 009/111] fix to secret scope acl split --- utils/split_logs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index 7147a48c..8dcd4e6b 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -126,7 +126,7 @@ def secret_scopes_acls(self, df, file_name="secret_scopes_acls.log"): if len(d) != 0: d = d.strip() d = json.loads(d) - if d['scope_name'] in df['secret_scope_names'].tolist(): + if d['scope_name'] in df['secret_scopes'].tolist(): data_write.append(d) if "items" in d.keys(): d['items'] = self.fix_acls(d['items']) From 6ce0947aeb26c210d7a84b01cf1ae9431cd87b16 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Thu, 23 Mar 2023 08:15:46 -0400 Subject: [PATCH 010/111] fixing groups split issue --- utils/split_logs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index 7147a48c..27704cc7 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -55,8 +55,8 @@ def fix_acls(self, acls, jobs=False): if 'principal' in permission.keys(): if permission['principal'] in self.imported_users: new_acls.append(permission) - if 'display' in permission.keys(): - if permission['display'] in self.imported_users: + if 'userName' in permission.keys(): + if permission['userName'] in self.imported_users: new_acls.append(permission) From 78ed463e090d3db75aaf01a548e6e1bea5c40208 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Thu, 23 Mar 2023 10:26:08 -0400 Subject: [PATCH 011/111] adjusting file names --- utils/split_logs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index dd24c066..7484551a 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -127,9 +127,11 @@ def secret_scopes_acls(self, df, file_name="secret_scopes_acls.log"): d = d.strip() d = json.loads(d) if d['scope_name'] in df['secret_scopes'].tolist(): - data_write.append(d) - if "items" in d.keys(): + print(d['items']) d['items'] = self.fix_acls(d['items']) + print(d['items']) + data_write.append(d) + except Exception as e: errors['Data'].append(d) errors['Error'].append(e) From 27deb6766104a2b84acd37e44a55ca867b4c38b7 Mon Sep 
17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Fri, 24 Mar 2023 18:03:27 -0400 Subject: [PATCH 012/111] fixing ACLs issue for groups --- utils/split_logs.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index 7484551a..1578e52c 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -58,9 +58,9 @@ def fix_acls(self, acls, jobs=False): if 'userName' in permission.keys(): if permission['userName'] in self.imported_users: new_acls.append(permission) - - - + if 'display' in permission.keys(): + if permission['display'] in self.imported_groups: + new_acls.append(permission) return new_acls def users(self, df, file_name="users.log"): @@ -79,7 +79,7 @@ def users(self, df, file_name="users.log"): except Exception as e: errors['Data'].append(d) errors['Error'].append(e) - + print(self.imported_users) self.write_logs(data_write, file_name) return errors @@ -127,11 +127,8 @@ def secret_scopes_acls(self, df, file_name="secret_scopes_acls.log"): d = d.strip() d = json.loads(d) if d['scope_name'] in df['secret_scopes'].tolist(): - print(d['items']) d['items'] = self.fix_acls(d['items']) - print(d['items']) data_write.append(d) - except Exception as e: errors['Data'].append(d) errors['Error'].append(e) @@ -183,9 +180,9 @@ def acl_clusters(self, df, file_name = "acl_clusters.log"): d = json.loads(d) cluster = d['object_id'].split("/")[-1] if cluster in df['cluster_id'].tolist(): + if "access_control_list" in d.keys(): + d['access_control_list'] = self.fix_acls(d['access_control_list']) data_write.append(d) - if "access_control_list" in d.keys(): - d['access_control_list'] = self.fix_acls(d['access_control_list']) except Exception as e: errors['Data'].append(d) errors['Error'].append(e) @@ -240,10 +237,11 @@ def acl_jobs(self, df, file_name="acl_jobs.log"): d = json.loads(d) jobid = d['object_id'].split("/")[-1] if int(jobid) in df['job_ids'].tolist(): + print(f"Editing Job with Job ID: {jobid}") + if "access_control_list" in d.keys(): + d['access_control_list'] = self.fix_acls(d['access_control_list']) data_write.append(d) - print(f"Editing Job with Job ID: {jobid}") - if "access_control_list" in d.keys(): - d['access_control_list'] = self.fix_acls(d['access_control_list']) + except Exception as e: errors['Data'].append(d) errors['Error'].append(e) @@ -289,6 +287,7 @@ def mounts(self, df, file_name='mounts.log'): def groups(self, df, file_name=None): groups = df['group_name'] errors = {'Data':[], 'Error':[]} + self.imported_groups = groups.tolist() for group in groups: try: @@ -310,8 +309,6 @@ def groups(self, df, file_name=None): except Exception as e: errors['Data'].append(group) errors['Error'].append(e) - all_groups = os.listdir(self.path + "groups") - self.imported_groups = [g for g in all_groups if g in groups ] return errors def user_dirs(self, df=None, file_name="user_dirs.log"): @@ -457,9 +454,9 @@ def acl_notebooks(self, df, file_name="acl_notebooks.log"): d = json.loads(d) path = str(d['path']) if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + if "access_control_list" in d.keys(): + d['access_control_list'] = self.fix_acls(d['access_control_list']) data_write.append(d) - if "access_control_list" in d.keys(): - d['access_control_list'] = self.fix_acls(d['access_control_list']) except Exception as e: errors['Data'].append(d) errors['Error'].append(e) @@ -493,9 +490,9 @@ def 
acl_directories(self, df, file_name="acl_directories.log"): d = json.loads(d) path = str(d['path']) if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + if "access_control_list" in d.keys(): + d['access_control_list'] = self.fix_acls(d['access_control_list']) data_write.append(d) - if "access_control_list" in d.keys(): - d['access_control_list'] = self.fix_acls(d['access_control_list']) except Exception as e: errors['Data'].append(d) errors['Error'].append(e) From bfc70ff09194e53199f8d6107ad5c40db86b60a3 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Mon, 27 Mar 2023 13:07:47 -0400 Subject: [PATCH 013/111] fixing Shared + artifacts ACLs split --- convert_all_logs.py | 2 +- utils/split_logs.py | 72 ++++++++++++++++++++++----------------------- utils/to_csv.py | 2 +- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/convert_all_logs.py b/convert_all_logs.py index 30504082..f0658bab 100644 --- a/convert_all_logs.py +++ b/convert_all_logs.py @@ -85,7 +85,7 @@ def main(checkpoint): scopes_df = util.create_scopes(checkpoint, directory_name = 'secret_scopes') util.save_to_csv(scopes_df, "secret_scopes.csv") except: - prnit("Error while trying to read secrets. Skipping...") + print("Error while trying to read secrets. Skipping...") # metastore try: diff --git a/utils/split_logs.py b/utils/split_logs.py index 7484551a..6714e19f 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -3,6 +3,7 @@ import shutil import pandas as pd import gzip +from datetime import datetime class Split(): def __init__(self, checkpoint, workspace): @@ -21,9 +22,9 @@ def read_log(self, file_name): data = f.read().split("\n") return data except FileNotFoundError as e: - return print(f"File {file_name} not found. ") + return print(f"{datetime.now()} Error: {file_name} not found. ") except Exception as e: - print(f"There was an error while reading {file_name}. ") + print(f"{datetime.now()} Error: There was an unknown error reading {file_name}. ") #print(e) return '' @@ -51,7 +52,7 @@ def fix_acls(self, acls, jobs=False): # user will get dropped if jobs: if permission['permission_level'] == 'IS_OWNER': - print(f"Dropping Job Owner {permission['user_name']} from job. Add Job Owner to acl_jobs.log") + print(f"{datetime.now()} Dropping Job Owner {permission['user_name']} from job. 
Add Job Owner to acl_jobs.log") if 'principal' in permission.keys(): if permission['principal'] in self.imported_users: new_acls.append(permission) @@ -127,9 +128,7 @@ def secret_scopes_acls(self, df, file_name="secret_scopes_acls.log"): d = d.strip() d = json.loads(d) if d['scope_name'] in df['secret_scopes'].tolist(): - print(d['items']) d['items'] = self.fix_acls(d['items']) - print(d['items']) data_write.append(d) except Exception as e: @@ -241,7 +240,7 @@ def acl_jobs(self, df, file_name="acl_jobs.log"): jobid = d['object_id'].split("/")[-1] if int(jobid) in df['job_ids'].tolist(): data_write.append(d) - print(f"Editing Job with Job ID: {jobid}") + print(f"{datetime.now()} - Editing Job with Job ID: {jobid}") if "access_control_list" in d.keys(): d['access_control_list'] = self.fix_acls(d['access_control_list']) except Exception as e: @@ -294,8 +293,6 @@ def groups(self, df, file_name=None): try: if "groups" not in os.listdir(self.new_path): os.mkdir(self.new_path + "groups/") - new_file_path = self.new_path + "groups/" - src_path = self.path + "groups/" + group group_data = self.read_log("groups/" + group) group_data_write = [] @@ -317,18 +314,19 @@ def groups(self, df, file_name=None): def user_dirs(self, df=None, file_name="user_dirs.log"): data_user = df user_names = data_user['userName'].tolist() - if "global_shared_logs" in os.listdir("./csv/"): - data_art = pd.read_csv('./csv/global_shared_logs.csv', index_col=0) - art_names = data_art['global_shared_folder_names'].tolist() - else: + try: + data_art - pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs") + art_names = data_art['global_folder_names'].tolist() + except: data_art = [] art_names = [] - if "shared_logs" in os.listdir("./csv/"): - data_shared = pd.read_csv('./csv/shared_logs.csv', index_col=0) + try: + data_shared = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_shared_logs") shared_names = data_shared['notebook_names'].tolist() - else: + except: data_shared = [] shared_names = [] + data = self.read_log(file_name) user_paths=['/Users/'+ n for n in user_names] shared_paths=['/Shared/'+ n for n in shared_names] @@ -352,16 +350,16 @@ def user_workspace(self, df, file_name="user_workspace.log"): data_user = df user_names = data_user['userName'].tolist() - if "global_shared_logs" in os.listdir("./csv/"): - data_art = pd.read_csv('csv/global_shared_logs.csv', index_col=0) - art_names = data_art['global_shared_folder_names'].tolist() - else: + try: + data_art = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs") + art_names = data_art['global_folder_names'].tolist() + except: data_art = [] art_names = [] - if "shared_logs" in os.listdir("./csv/"): - data_shared = pd.read_csv('csv/shared_logs.csv', index_col=0) + try: + data_shared = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_shared_logs") shared_names = data_shared['notebook_names'].tolist() - else: + except: data_shared = [] shared_names = [] data = self.read_log(file_name) @@ -434,18 +432,19 @@ def user_notebooks(self, df, file_name=None): def acl_notebooks(self, df, file_name="acl_notebooks.log"): data_user = df user_names = data_user['userName'].tolist() - if "global_shared_logs" in os.listdir("./csv/"): - data_art = pd.read_csv('csv/global_shared_logs.csv', index_col=0) - art_names = data_art['global_shared_folder_names'].tolist() - else: + try: + data_art - pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs") + art_names = data_art['global_folder_names'].tolist() + except: data_art = [] art_names = [] - if 
"shared_logs" in os.listdir("./csv/"): - data_shared = pd.read_csv('csv/shared_logs.csv', index_col=0) + try: + data_shared = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_shared_logs") shared_names = data_shared['notebook_names'].tolist() - else: + except: data_shared = [] shared_names = [] + data = self.read_log(file_name) user_paths=['/Users/'+ n for n in user_names] shared_paths=['/Shared/'+ n for n in shared_names] @@ -469,18 +468,19 @@ def acl_notebooks(self, df, file_name="acl_notebooks.log"): def acl_directories(self, df, file_name="acl_directories.log"): data_user = df user_names = data_user['userName'].tolist() - if "global_shared_logs" in os.listdir("./csv/"): - data_art = pd.read_csv('csv/global_shared_logs.csv', index_col=0) - art_names = data_art['global_shared_folder_names'].tolist() - else: + try: + data_art - pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs") + art_names = data_art['global_folder_names'].tolist() + except: data_art = [] art_names = [] - if "shared_logs" in os.listdir("./csv/"): - data_shared = pd.read_csv('csv/shared_logs.csv', index_col=0) + try: + data_shared = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_shared_logs") shared_names = data_shared['notebook_names'].tolist() - else: + except: data_shared = [] shared_names = [] + data = self.read_log(file_name) user_paths=['/Users/'+ n for n in user_names] shared_paths=['/Shared/'+ n for n in shared_names] diff --git a/utils/to_csv.py b/utils/to_csv.py index 5523ac61..6fad7fa5 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -170,7 +170,7 @@ def create_jobs(data): return {'job_ids': job_ids, 'job_names': job_names, 'job_type':job_types, 'job_creator':job_owners } -def create_shared_logs(checkpoint = "", directory_name = "artifacts/shared"): +def create_shared_logs(checkpoint = "", directory_name = "artifacts/Shared"): shared_path = f"./logs/{checkpoint}/{directory_name}" notebooks = os.listdir(shared_path) From d646e16b8fb7b71e95b3523dbe9db7366cd524a8 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Tue, 28 Mar 2023 10:47:01 -0400 Subject: [PATCH 014/111] adding + fixing error logging capabilities --- utils/create_workspace.py | 17 +++++++++-------- utils/split_logs.py | 14 +++++++++----- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/utils/create_workspace.py b/utils/create_workspace.py index d5d96701..b48e3e4e 100644 --- a/utils/create_workspace.py +++ b/utils/create_workspace.py @@ -1,6 +1,5 @@ from utils.split_logs import Split import os -import json import shutil import pandas as pd from datetime import datetime @@ -83,7 +82,6 @@ def run(self): """ # for each for m in self.map.keys(): - print(f"{datetime.now()} Starting to split {m}.") try: # get the asset function that splits that asset module_function = self.map[m][1] @@ -91,6 +89,10 @@ def run(self): sheet = self.map[m][0] # split_csv performs the actual split and outputs all csvs that were not in the csv directory success = self.split_csv(m, module_function, sheet) + if success==0: + print(f"{datetime.now()} Successfully split {m}.") + else: + print(f"{datetime.now()} Error during split {m}.") except Exception as e: pass return 0 @@ -100,17 +102,16 @@ def split_csv(self, module, module_function, sheet_name): # you can set that variable to True or 1 or anything else that the client is using # but it will ignore anything else df = pd.read_excel("asset_mapping.xlsx", sheet_name = sheet_name) - #df = pd.read_csv("./csv/"+csv, index_col=0) current_df = df[df[self.workspace] == "Y"] # send that 
subset dataframe to the module function found in Split class errors = module_function(current_df.reset_index()) #pushing all errors to a csv - if 'errors' not in self.new_path: + if 'errors' not in os.listdir(self.new_path): os.mkdir(self.new_path + 'errors') - - if len(errors[0]) > 0: - print(f"{datetime.now()} There are errors. Please review error logs for {module}") - pd.DataFrame(errors).to_csv(self.new_path + 'errors/' + sheet_name + '.csv') + er = pd.DataFrame(errors) + if len(er) > 0: + print(f"{datetime.now()} There are {len(er)} errors. Please review error logs for {module}") + er.to_csv(self.new_path + 'errors/' + module + '.csv') # success should be 0 return 0 diff --git a/utils/split_logs.py b/utils/split_logs.py index 5274c303..8dcba6eb 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -51,8 +51,9 @@ def fix_acls(self, acls, jobs=False): else: # user will get dropped if jobs: - if permission['permission_level'] == 'IS_OWNER': - print(f"{datetime.now()} Dropping Job Owner {permission['user_name']} from job. Add Job Owner to acl_jobs.log") + if permission['all_permissions'][0]['permission_level'] == 'IS_OWNER': + print(f"{datetime.now()} The user {permission['user_name']} owns a job. This job will not be added to the split log. Please change the owner or add the user in the asset mapping.") + return 0 if 'principal' in permission.keys(): if permission['principal'] in self.imported_users: new_acls.append(permission) @@ -80,7 +81,6 @@ def users(self, df, file_name="users.log"): except Exception as e: errors['Data'].append(d) errors['Error'].append(e) - print(self.imported_users) self.write_logs(data_write, file_name) return errors @@ -238,9 +238,13 @@ def acl_jobs(self, df, file_name="acl_jobs.log"): d = json.loads(d) jobid = d['object_id'].split("/")[-1] if int(jobid) in df['job_ids'].tolist(): - print(f"{datetime.now()} - Editing Job with Job ID: {jobid}") + # print(f"{datetime.now()} - Editing Job with Job ID: {jobid}") if "access_control_list" in d.keys(): - d['access_control_list'] = self.fix_acls(d['access_control_list']) + d['access_control_list'] = self.fix_acls(d['access_control_list'], jobs=True) + if d['access_control_list'] == 0: + errors['Data'].append(jobid) + errors['Error'].append("Job Owner is not tagged in the asset mapping.") + continue data_write.append(d) except Exception as e: errors['Data'].append(d) From 5181fce1db2ca9663c41f49117abb6f54d011443 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Tue, 28 Mar 2023 10:57:43 -0400 Subject: [PATCH 015/111] rephrasing outputs. grammatical issues --- utils/create_workspace.py | 10 +++++----- utils/split_logs.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/utils/create_workspace.py b/utils/create_workspace.py index b48e3e4e..a452360c 100644 --- a/utils/create_workspace.py +++ b/utils/create_workspace.py @@ -88,13 +88,13 @@ def run(self): # get the appropriate csv that matches it sheet = self.map[m][0] # split_csv performs the actual split and outputs all csvs that were not in the csv directory + print(f"{datetime.now()} Working on {m}...") success = self.split_csv(m, module_function, sheet) - if success==0: - print(f"{datetime.now()} Successfully split {m}.") - else: - print(f"{datetime.now()} Error during split {m}.") + except Exception as e: pass + + print(f"{datetime.now()} Please review error logs in the {self.new_path}errors/ directory to confirm successful split. 
") return 0 def split_csv(self, module, module_function, sheet_name): @@ -111,7 +111,7 @@ def split_csv(self, module, module_function, sheet_name): er = pd.DataFrame(errors) if len(er) > 0: - print(f"{datetime.now()} There are {len(er)} errors. Please review error logs for {module}") + print(f"{datetime.now()} WARNING: There are {len(er)} errors. Please review error logs for {module}") er.to_csv(self.new_path + 'errors/' + module + '.csv') # success should be 0 return 0 diff --git a/utils/split_logs.py b/utils/split_logs.py index 8dcba6eb..5d7cc62e 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -22,9 +22,9 @@ def read_log(self, file_name): data = f.read().split("\n") return data except FileNotFoundError as e: - return print(f"{datetime.now()} Error: {file_name} not found. ") + return print(f"{datetime.now()} Error: {file_name} not found. ") except Exception as e: - print(f"{datetime.now()} Error: There was an unknown error reading {file_name}. ") + print(f"{datetime.now()} Error: There was an unknown error reading {file_name}. ") #print(e) return '' @@ -52,7 +52,7 @@ def fix_acls(self, acls, jobs=False): # user will get dropped if jobs: if permission['all_permissions'][0]['permission_level'] == 'IS_OWNER': - print(f"{datetime.now()} The user {permission['user_name']} owns a job. This job will not be added to the split log. Please change the owner or add the user in the asset mapping.") + # print(f"{datetime.now()} The user {permission['user_name']} owns a job. This job will not be added to the split log. Please change the owner or add the user in the asset mapping.") return 0 if 'principal' in permission.keys(): if permission['principal'] in self.imported_users: From caf44197fb2400c91c52f447f21e395f370fb978 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Tue, 28 Mar 2023 11:40:56 -0400 Subject: [PATCH 016/111] adding default job owner capability --- map.py | 7 ++++--- utils/create_workspace.py | 4 ++-- utils/split_logs.py | 11 ++++++++--- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/map.py b/map.py index 5b17e69d..d61da16b 100644 --- a/map.py +++ b/map.py @@ -8,12 +8,13 @@ def main(): # takes two arguments: checkpoint and workspaces all_args = argparse.ArgumentParser() all_args.add_argument("--checkpoint", dest="checkpoint", default="", help="set if you are using a checkpoint during export") - all_args.add_argument("--workspaces", dest="workspaces", nargs="+", required=True, help="list of workspace names. must match csv columns") - + all_args.add_argument("--workspaces", dest="workspaces", nargs="+", required=True, help="list of workspace names. 
must match columns in asset_mapping.xslx.") + all_args.add_argument('--default-job-owner', dest="default_job_owner", default=False, help="set if you want to add a job owner to jobs that drop untagged owners.") args = all_args.parse_args() checkpoint = args.checkpoint workspaces = args.workspaces + default_owner = args.default_job_owner # for each workspace @@ -22,7 +23,7 @@ def main(): # this instantiates the original location of the session and the new location of the session # it also instantiates another class Split - refer to split_logs.py # Split instantiates the same thing as well as two variables: imported users and imported groups (this is used for remaking ACLs) - workspace = Workspace(checkpoint, w, workspaces) + workspace = Workspace(checkpoint, w, workspaces, default_owner) success = workspace.run() workspace.copy_other_files() diff --git a/utils/create_workspace.py b/utils/create_workspace.py index a452360c..3f7928e4 100644 --- a/utils/create_workspace.py +++ b/utils/create_workspace.py @@ -5,13 +5,13 @@ from datetime import datetime class Workspace(): - def __init__(self, checkpoint, workspace, all_workspaces): + def __init__(self, checkpoint, workspace, all_workspaces, default_owner=False): self.path = "./logs/"+checkpoint+"/" self.workspace = str(workspace) self.new_path = "./logs/"+checkpoint+"_"+workspace+"/" self.workspaces = all_workspaces self.checkpoint = checkpoint - split = Split(checkpoint, workspace) + split = Split(checkpoint, workspace, default_owner) # this is where all assets are mapped to what csv they refer to + what function they use for the split self.map = { diff --git a/utils/split_logs.py b/utils/split_logs.py index 5d7cc62e..17c46233 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -6,12 +6,13 @@ from datetime import datetime class Split(): - def __init__(self, checkpoint, workspace): + def __init__(self, checkpoint, workspace, default_owner=False): self.path = "./logs/"+checkpoint+"/" self.workspace = workspace self.new_path = "./logs/"+checkpoint+"_"+workspace+"/" self.imported_users = [] self.imported_groups = ['admins', 'Users'] + self.default_job_owner = default_owner def read_log(self, file_name): """ @@ -52,8 +53,12 @@ def fix_acls(self, acls, jobs=False): # user will get dropped if jobs: if permission['all_permissions'][0]['permission_level'] == 'IS_OWNER': - # print(f"{datetime.now()} The user {permission['user_name']} owns a job. This job will not be added to the split log. Please change the owner or add the user in the asset mapping.") - return 0 + if self.default_job_owner: + default_permission = {"user_name": self.default_job_owner, "all_permissions": [{"permission_level": "IS_OWNER", "inherited": False}]} + new_acls.append(default_permission) + else: + # print(f"{datetime.now()} The user {permission['user_name']} owns a job. This job will not be added to the split log. 
Please change the owner or add the user in the asset mapping.") + return 0 if 'principal' in permission.keys(): if permission['principal'] in self.imported_users: new_acls.append(permission) From 1322074d9f78c1752e43453d97c0871bf5b8facc Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Tue, 28 Mar 2023 14:32:23 -0400 Subject: [PATCH 017/111] fixing dir acl issue --- utils/split_logs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index 17c46233..6e386675 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -462,7 +462,7 @@ def acl_notebooks(self, df, file_name="acl_notebooks.log"): try: d = json.loads(d) path = str(d['path']) - if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + if (path.startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): if "access_control_list" in d.keys(): d['access_control_list'] = self.fix_acls(d['access_control_list']) data_write.append(d) @@ -499,7 +499,7 @@ def acl_directories(self, df, file_name="acl_directories.log"): try: d = json.loads(d) path = str(d['path']) - if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + if (path.startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): if "access_control_list" in d.keys(): d['access_control_list'] = self.fix_acls(d['access_control_list']) data_write.append(d) From b462bfc2c36b7acc85252b4c40c85fab90cf457e Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 28 Mar 2023 15:01:41 -0400 Subject: [PATCH 018/111] revert last commit --- utils/split_logs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index 6e386675..17c46233 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -462,7 +462,7 @@ def acl_notebooks(self, df, file_name="acl_notebooks.log"): try: d = json.loads(d) path = str(d['path']) - if (path.startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): if "access_control_list" in d.keys(): d['access_control_list'] = self.fix_acls(d['access_control_list']) data_write.append(d) @@ -499,7 +499,7 @@ def acl_directories(self, df, file_name="acl_directories.log"): try: d = json.loads(d) path = str(d['path']) - if (path.startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): + if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))): if "access_control_list" in d.keys(): d['access_control_list'] = self.fix_acls(d['access_control_list']) data_write.append(d) From 1c90ee366664a61fc18454ab2d479f5480dc5446 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 28 Mar 2023 15:37:39 -0400 Subject: [PATCH 019/111] fixing syntax error --- utils/split_logs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index 17c46233..112bc227 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -440,9 +440,10 @@ def acl_notebooks(self, df, file_name="acl_notebooks.log"): data_user = df user_names = 
data_user['userName'].tolist() try: - data_art - pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs") + data_art = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs") art_names = data_art['global_folder_names'].tolist() - except: + except Exception as e: + print(e) data_art = [] art_names = [] try: @@ -476,7 +477,7 @@ def acl_directories(self, df, file_name="acl_directories.log"): data_user = df user_names = data_user['userName'].tolist() try: - data_art - pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs") + data_art = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs") art_names = data_art['global_folder_names'].tolist() except: data_art = [] From cecf55e696c3ce9269ad2839e9fbf5c22f7dc2aa Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Thu, 30 Mar 2023 10:42:51 -0400 Subject: [PATCH 020/111] adding library migration script; content created by Tejas --- library_migration.py | 168 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 library_migration.py diff --git a/library_migration.py b/library_migration.py new file mode 100644 index 00000000..af07a42d --- /dev/null +++ b/library_migration.py @@ -0,0 +1,168 @@ +import json +import argparse +import requests +from datetime import datetime +import configparser +import re +import os +from os import path + +class dbclient: + def __init__(self, profile): + login = self.get_login_credentials(profile) + url = login['host'] + token = login['token'] + self.url = self.url_validation(url) + self.token = token + + def url_validation(self, url): + if '/?o=' in url: + # if the workspace_id exists, lets remove it from the URL + url = re.sub("\/\?o=.*", '', url) + elif 'net/' == url[-4:]: + url = url[:-1] + elif 'com/' == url[-4:]: + url = url[:-1] + return url.rstrip("/") + + def get_login_credentials(self, profile='DEFAULT'): + creds_path = '~/.databrickscfg' + config = configparser.ConfigParser() + abs_creds_path = path.expanduser(creds_path) + config.read(abs_creds_path) + try: + current_profile = dict(config[profile]) + if not current_profile: + raise ValueError(f"Unable to find a defined profile to run this tool. Profile \'{profile}\' not found.") + return current_profile + except KeyError: + raise ValueError( + 'Unable to find credentials to load for profile. Profile only supports tokens.') + + def get_url_token(self): + return self.url, self.token + +def get_libraries_cluster(token, workspace_url, cluster_id): + url = f"{workspace_url}/api/2.0/libraries/cluster-status" + print(f"{datetime.now()} Endpoint: {url}") + print(f"{datetime.now()} Getting list of libraries from clusters... ") + st_response = requests.get(url, headers = {"Authentication": f"Bearer {token}"}, json = {"cluster_id": cluster_id}) + + if st_response.status_code != 200: + print(f"{datetime.now()} ERROR. ") + print(st_response.content) + return '' + else: + st_statuses = st_response.json() + return st_statuses + +def get_cluster_name(token, workspace_url): + url = f"{workspace_url}/api/2.0/clusters/list" + print(f"{datetime.now()} Endpoint: {url}") + print(f"{datetime.now()} Getting list of clusters from {workspace_url}... ") + + response = requests.get(url, headers = {"Authentication": f"Bearer {token}"}) + + if response.status_code != 200: + print(f"{datetime.now()} ERROR. 
") + raise Exception(response.content) + else: + return response.json() + +# Find ST cluster_name from the ST cluster_id +def find_cluster_name(cluster_id, json_list): + for i in json_list: + if cluster_id == i['cluster_id']: + return i['cluster_name'] + return '' +# Find E2 cluster id using the cluster_name +def find_cluster_id(cluster_name, json_list): + for i in json_list: + if cluster_name == i['cluster_name']: + return i['cluster_id'] + return '' + +def export_pipeline(old_profile, new_profile): + old_dbclient = dbclient(profile=old_profile) + old_url, old_token = old_dbclient.get_url_token() + + st_clusters = get_cluster_name(old_token, old_url) + + new_dbclient = dbclient(profile=new_profile) + new_url, new_token = new_dbclient.get_url_token() + + e2_clusters = get_cluster_name(new_token, new_url) + + st_clusters['clusters'] = [i for i in st_clusters['clusters'] if 'JOB' not in i['cluster_source']] + e2_clusters['clusters'] = [i for i in e2_clusters['clusters'] if 'JOB' not in i['cluster_source']] + + st_statuses = [] + for i in st_clusters['clusters']: + st_statuses.append(get_libraries_cluster(old_token, old_url, i['cluster_id'])) + + no_libraries = [] + with_libraries = [] + for i in st_statuses: + try: + st_cname = find_cluster_name(i['cluster_id'], st_clusters['clusters']) + if st_cname != '': + e2_cid = find_cluster_id(st_cname, e2_clusters['clusters']) + if e2_cid != '': + print(f"{datetime.now()} Creating Cluster ID Mapping... ") + print(f"{' '*26} Cluster Name: {st_cname} {i['cluster_id']} -> {e2_cid}") + i['cluster_id'] = e2_cid + with_libraries.append({ + 'cluster_id': e2_cid, + 'libraries': [j['library'] for j in i['library_statuses']] + }) + else: + print(f"{datetime.now()} Error: Cannot find the cluster {st_cname} in new workspace") + else: + print(f"{datetime.now()} Error: Cannot find the cluster id {i['cluster_id']} in the original workspace") + except Exception as e: + no_libraries.append(i['cluster_id']) + + return with_libraries, no_libraries + +def install_library(token, workspace_url, data): + library_install_url = f"{workspace_url}/api/2.0/libraries/install" + print(f"{datetime.now()} Endpoint: {library_install_url}") + print(f"{datetime.now()} Installing libraries on new clusters... ") + + for i in data: + response = requests.post(library_install_url, headers = {"Authentication": f"Bearer {token}"}, json=i) + + if response.status_code == 200: + print(f"{datetime.now()} Successfully added libraries for", i['cluster_id']) + else: + print(f"{datetime.now()} Error: Cannot add libraries for", i['cluster_id']) + print(response.content) + +def import_pipeline(new_profile, data): + new_dbclient = dbclient(profile=new_profile) + new_url, new_token = new_dbclient.get_url_token() + install_library(new_token, new_url, data) + return + + +def main(): + all_args = argparse.ArgumentParser() + all_args.add_argument('--old-profile', dest="old", help="Profile of the old workspace. ") + all_args.add_argument('--new-profile', dest="new", help="Profile of the new workspace. ") + args = all_args.parse_args() + + old_dbclient = args.old + new_dbclient = args.new + + print(f"{datetime.now()} EXPORTING LIBRARIES... ") + libraries_data, no_libraries = export_pipeline(old_dbclient, new_dbclient) + print() + confirm = input(f"Import from? (y/N) ") + if confirm.lower() in ["y", "yes"]: + print(f"{datetime.now()} IMPORTING LIBRARIES... ") + import_pipeline(new_dbclient, libraries_data) + else: + print(f"{datetime.now()} EXITING PIPELINE... 
") + +if __name__ == "__main__": + main() From 32c3a6fde31adaf0b2daa7b0803eca0d85e4b598 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Thu, 30 Mar 2023 10:49:44 -0400 Subject: [PATCH 021/111] adding logging fun stuff --- utils/split_logs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index 112bc227..0baa444b 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -57,7 +57,6 @@ def fix_acls(self, acls, jobs=False): default_permission = {"user_name": self.default_job_owner, "all_permissions": [{"permission_level": "IS_OWNER", "inherited": False}]} new_acls.append(default_permission) else: - # print(f"{datetime.now()} The user {permission['user_name']} owns a job. This job will not be added to the split log. Please change the owner or add the user in the asset mapping.") return 0 if 'principal' in permission.keys(): if permission['principal'] in self.imported_users: From f550f70e21ae4d45d7c28ac75b239b0677988144 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 6 Apr 2023 10:41:55 -0400 Subject: [PATCH 022/111] Update default cluster name to E2_Migration --- data/aws_cluster.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/aws_cluster.json b/data/aws_cluster.json index 6ad6a203..012abc32 100644 --- a/data/aws_cluster.json +++ b/data/aws_cluster.json @@ -1,6 +1,6 @@ { "num_workers": 1, - "cluster_name": "Workspace_Migration_Work_Leave_Me_Alone", + "cluster_name": "E2_Migration", "spark_version": "10.4.x-scala2.12", "aws_attributes": { "first_on_demand": 1, From 0d0c95d562d2bb0e8a45b5fd7d075835fe51edb8 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 6 Apr 2023 10:43:08 -0400 Subject: [PATCH 023/111] Update default cluster name, remove spark configs Spark configurations that changed the metastore version previously are now removed --- data/aws_cluster_table_acls.json | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/data/aws_cluster_table_acls.json b/data/aws_cluster_table_acls.json index b3f521e1..32113a54 100644 --- a/data/aws_cluster_table_acls.json +++ b/data/aws_cluster_table_acls.json @@ -1,13 +1,9 @@ { "num_workers": 1, - "cluster_name": "API_Table_ACL_Work_Leave_Me_Alone", + "cluster_name": "E2_Migration", "spark_version": "10.4.x-scala2.12", "spark_conf": { - "spark.databricks.cluster.profile": "serverless", - "spark.databricks.repl.allowedLanguages": "python,sql", "spark.databricks.acl.dfAclsEnabled": "true", - "spark.sql.hive.metastore.version": "1.2.1", - "spark.sql.hive.metastore.jars": "maven" }, "aws_attributes": { "first_on_demand": 1, From b16961f54daf1aad12570daf027325955bda6ffd Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Thu, 6 Apr 2023 15:46:35 -0400 Subject: [PATCH 024/111] fixing parser.py to allow export_db.py to be used --- dbclient/parser.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dbclient/parser.py b/dbclient/parser.py index 9e35ddaa..d298fa26 100644 --- a/dbclient/parser.py +++ b/dbclient/parser.py @@ -232,6 +232,12 @@ def get_export_parser(): parser.add_argument('--exclude-work-item-prefixes', nargs='+', type=str, default=[], help='List of prefixes to skip export for log_all_workspace_items') + + parser.add_argument('--timeout', type=float, default=300.0, + help='Timeout for the calls to Databricks\' REST API, in seconds, defaults to 300.0 --use float e.g. 
100.0 to make it bigger') + + parser.add_argument('--skip-missing-users', action='store_true', default=False, + help='Skip missing principles during import.') return parser @@ -419,7 +425,9 @@ def build_client_config(profile, url, token, args): 'verify_ssl': (not args.no_ssl_verification), 'skip_failed': args.skip_failed, 'debug': args.debug, - 'file_format': str(args.notebook_format) + 'file_format': str(args.notebook_format), + 'timeout':args.timeout, + 'skip_missing_users':args.skip_missing_users } # this option only exists during imports so we check for existence if 'overwrite_notebooks' in args: From 6aec643847b9eb1b3e2b28ba422ca1634bf8cf01 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Mon, 17 Apr 2023 11:30:04 -0400 Subject: [PATCH 025/111] new default job owner paramter to set default owner when owners are missing in legacy --- dbclient/JobsClient.py | 11 ++++++++++- dbclient/parser.py | 5 ++++- tasks/tasks.py | 4 ++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/dbclient/JobsClient.py b/dbclient/JobsClient.py index 5f6c5faa..2d5f728e 100644 --- a/dbclient/JobsClient.py +++ b/dbclient/JobsClient.py @@ -79,7 +79,7 @@ def update_imported_job_names(self, error_logger, checkpoint_job_configs_set): else: raise RuntimeError("Import job has failed. Refer to the previous log messages to investigate.") - def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.log', acl_file='acl_jobs.log'): + def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.log', acl_file='acl_jobs.log', default_job_owner=False): """ log all job configs and the ACLs for each job :param users_list: a list of users / emails to filter the results upon (optional for group exports) @@ -131,6 +131,15 @@ def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.lo for permission in acl.get("all_permissions"): if permission.get("permission_level") == "IS_OWNER": valid_acl = True + if not valid_acl and default_job_owner: + default_owner_permission = {"user_name": default_job_owner, "all_permissions": [{"permission_level": "IS_OWNER", "inherited": False}]} + acls.append(default_owner_permission) + # re check if ACL is valid + for acl in acls: + for permission in acl.get("all_permissions"): + if permission.get("permission_level") == "IS_OWNER": + valid_acl = True + if valid_acl: # job and job_acl are fine, writing both to the output files log_fp.write(json.dumps(x) + '\n') diff --git a/dbclient/parser.py b/dbclient/parser.py index c478e717..7468834e 100644 --- a/dbclient/parser.py +++ b/dbclient/parser.py @@ -497,7 +497,10 @@ def get_pipeline_parser() -> argparse.ArgumentParser: parser.add_argument('--archive-missing', action='store_true', help='Import all missing users into the top level /Archive/ directory.') - + # Jobs arguments + parser.add_argument('--default-job-owner', action='store', default=False, + help='Set a default job owner for jobs without an owner.') + # Metastore arguments parser.add_argument('--repair-metastore-tables', action='store_true', default=False, help='Repair legacy metastore tables') diff --git a/tasks/tasks.py b/tasks/tasks.py index ca0bd751..4d034e02 100644 --- a/tasks/tasks.py +++ b/tasks/tasks.py @@ -258,9 +258,9 @@ def run(self): jobs_c = JobsClient(self.client_config, self.checkpoint_service) if self.client_config.get("groups_to_keep"): - jobs_c.log_job_configs(groups_list=self.client_config.get("groups_to_keep")) + jobs_c.log_job_configs(groups_list=self.client_config.get("groups_to_keep"), 
default_job_owner=self.args.default_job_owner) else: - jobs_c.log_job_configs() + jobs_c.log_job_configs(default_job_owner=self.args.default_job_owner) class JobsImportTask(AbstractTask): From d19c3b983cc70a7fdb2aee99558a5c2e21aaf01c Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Mon, 24 Apr 2023 15:57:58 -0400 Subject: [PATCH 026/111] will update secret_scopes_acls for updating emails --- dbclient/dbclient.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbclient/dbclient.py b/dbclient/dbclient.py index b9f59952..a6c86e31 100644 --- a/dbclient/dbclient.py +++ b/dbclient/dbclient.py @@ -425,7 +425,8 @@ def update_email_addresses(self, old_email_address, new_email_address): logs_to_update = ['users.log', 'acl_jobs.log', 'acl_clusters.log', 'acl_cluster_policies.log', - 'acl_notebooks.log', 'acl_directories.log'] + 'acl_notebooks.log', 'acl_directories.log', + 'secret_scopes_acls.log'] for logfile in logs_to_update: if os.path.exists(log_dir + logfile): self.replace_file_contents(old_email_address, new_email_address, logfile) From 26e7614760ee68df0cc0803aa21dbaf6ab0eaecc Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 4 May 2023 13:16:26 -0400 Subject: [PATCH 027/111] Update JobsClient.py Continuous scheduled jobs will now be imported as PAUSED instead of UNPAUSED --- dbclient/JobsClient.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dbclient/JobsClient.py b/dbclient/JobsClient.py index 2d5f728e..57dfcd50 100644 --- a/dbclient/JobsClient.py +++ b/dbclient/JobsClient.py @@ -218,6 +218,12 @@ def adjust_ids_for_cluster(settings): #job_settings or task_settings # set all imported jobs as paused job_schedule['pause_status'] = 'PAUSED' job_settings['schedule'] = job_schedule + job_schedule_continuous = job_settings.get("continuous", None) + if job_schedule_continuous: + # set all import jobs as paused + job_schedule_continuous['pause_status'] = "PAUSED" + job_settings['continuous'] = job_schedule_continuous + if 'format' not in job_settings or job_settings.get('format') == 'SINGLE_TASK': adjust_ids_for_cluster(job_settings) else: From 6243948b47848ca6856a00c4eb70bce7dd91b600 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 16 May 2023 09:55:29 -0400 Subject: [PATCH 028/111] Update dbclient.py updates jobs.log when updating email addrees --- dbclient/dbclient.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbclient/dbclient.py b/dbclient/dbclient.py index a6c86e31..54a10ea5 100644 --- a/dbclient/dbclient.py +++ b/dbclient/dbclient.py @@ -422,7 +422,8 @@ def update_email_addresses(self, old_email_address, new_email_address): :return: """ log_dir = self.get_export_dir() - logs_to_update = ['users.log', + logs_to_update = ['users.log', + 'jobs.log', 'acl_jobs.log', 'acl_clusters.log', 'acl_cluster_policies.log', 'acl_notebooks.log', 'acl_directories.log', From 7a6813972d2b9f17527712abb05af48f6b257a8a Mon Sep 17 00:00:00 2001 From: Sarah Cree <111320702+SarahCree@users.noreply.github.com> Date: Fri, 9 Jun 2023 08:16:44 -0600 Subject: [PATCH 029/111] Asset mapping spreadsheet updates (#8) * Update convert_all_logs.py Add acl_jobs.log to create_jobs() function to get job owner from acls * Update to_csv.py Added instance profiles for jobs and clusters, and added job owner for jobs --- convert_all_logs.py | 3 ++- utils/to_csv.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 6 
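# Standalone sketch of the pause-on-import behaviour shown in JobsClient above: both
# cron schedules and continuous triggers are forced to PAUSED so imported jobs do not
# start running in the target workspace. Not the actual client code.
def pause_job_settings(job_settings):
    for key in ("schedule", "continuous"):
        trigger = job_settings.get(key)
        if trigger is not None:
            trigger["pause_status"] = "PAUSED"
    return job_settings

settings = {"name": "nightly", "continuous": {"pause_status": "UNPAUSED"}}
assert pause_job_settings(settings)["continuous"]["pause_status"] == "PAUSED"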
deletions(-) diff --git a/convert_all_logs.py b/convert_all_logs.py index f0658bab..664c960a 100644 --- a/convert_all_logs.py +++ b/convert_all_logs.py @@ -53,7 +53,8 @@ def main(checkpoint): # job try: jobs_data = util.read_log('jobs.log', checkpoint) - jobs_df = util.create_jobs(jobs_data) + jobs_acls = util.read_log('acl_jobs.log', checkpoint) + jobs_df = util.create_jobs(jobs_data, jobs_acls) util.save_to_csv(jobs_df, "jobs.csv") except: print("Error while trying to read jobs. Skipping...") diff --git a/utils/to_csv.py b/utils/to_csv.py index 6fad7fa5..1c4f584d 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -129,6 +129,7 @@ def create_clusters(data): cluster_name = [] creator_user_name = [] policy_id = [] + instance_profile = [] for d in data: try: @@ -140,16 +141,22 @@ def create_clusters(data): policy_id.append(d['policy_id']) else: policy_id.append(" ") + try: + instance_profile.append(d['aws_attributes']['instance_profile_arn']) + except: + instance_profile.append('') except Exception as e: print("Error in creating clusters...") - return {'cluster_id': cluster_id, 'cluster_name': cluster_name, 'creator_user_name': creator_user_name, 'policy_id': policy_id} + return {'cluster_id': cluster_id, 'cluster_name': cluster_name, 'creator_user_name': creator_user_name, 'policy_id': policy_id, 'instance_profile': instance_profile} -def create_jobs(data): +def create_jobs(data, jobs_acls): job_ids = [] job_names = [] job_types = [] + job_creators = [] job_owners = [] + instance_profile = [] for d in data: try: @@ -162,12 +169,27 @@ def create_jobs(data): except: job_types.append('N/A') try: - job_owners.append(d['creator_user_name']) + job_creators.append(d['creator_user_name']) + except: + job_creators.append('N/A') + try: + instance_profile.append(d['settings']['new_cluster']['aws_attributes']['instance_profile_arn']) except: - job_owners.append('N/A') + instance_profile.append('') except Exception as e: print("Error in creating jobs...") - return {'job_ids': job_ids, 'job_names': job_names, 'job_type':job_types, 'job_creator':job_owners } + + for a in jobs_acls: + try: + a = json.loads(a) + for j in a['access_control_list']: + if j.get('user_name', None) != None: + if j['all_permissions'][0]['permission_level'] == 'IS_OWNER': + job_owners.append(j['user_name']) + except: + job_owners.append('') + + return {'job_ids': job_ids, 'job_names': job_names, 'job_type':job_types, 'job_creator':job_creators, 'job_owner': job_owners, 'instance_profile': instance_profile} def create_shared_logs(checkpoint = "", directory_name = "artifacts/Shared"): From 099a4a668ec3644809f65097d5ffaa5b2b7ca8a7 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Mon, 10 Jul 2023 09:57:14 -0400 Subject: [PATCH 030/111] adding parameter tag --- map.py | 6 ++++-- utils/create_workspace.py | 9 +++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/map.py b/map.py index d61da16b..77a89e21 100644 --- a/map.py +++ b/map.py @@ -10,12 +10,14 @@ def main(): all_args.add_argument("--checkpoint", dest="checkpoint", default="", help="set if you are using a checkpoint during export") all_args.add_argument("--workspaces", dest="workspaces", nargs="+", required=True, help="list of workspace names. 
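# Standalone sketch of how an IS_OWNER principal can be pulled out of one line of
# acl_jobs.log, mirroring the create_jobs change above; the function name and sample
# line are illustrative only.
import json

def owner_from_acl_line(line):
    acl = json.loads(line)
    for entry in acl.get("access_control_list", []):
        user = entry.get("user_name")
        perms = entry.get("all_permissions", [])
        if user and perms and perms[0].get("permission_level") == "IS_OWNER":
            return user
    return ""

sample = '{"access_control_list": [{"user_name": "owner@example.com", "all_permissions": [{"permission_level": "IS_OWNER"}]}]}'
assert owner_from_acl_line(sample) == "owner@example.com"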
must match columns in asset_mapping.xslx.") all_args.add_argument('--default-job-owner', dest="default_job_owner", default=False, help="set if you want to add a job owner to jobs that drop untagged owners.") + all_args.add_argument('--tag', dest="tag", default='Y', help="tag used in asset_mapping.xslx.") + args = all_args.parse_args() checkpoint = args.checkpoint workspaces = args.workspaces default_owner = args.default_job_owner - + tag = args.tag # for each workspace for w in workspaces: @@ -23,7 +25,7 @@ def main(): # this instantiates the original location of the session and the new location of the session # it also instantiates another class Split - refer to split_logs.py # Split instantiates the same thing as well as two variables: imported users and imported groups (this is used for remaking ACLs) - workspace = Workspace(checkpoint, w, workspaces, default_owner) + workspace = Workspace(checkpoint, w, workspaces, default_owner, tag) success = workspace.run() workspace.copy_other_files() diff --git a/utils/create_workspace.py b/utils/create_workspace.py index 3f7928e4..6992ad9a 100644 --- a/utils/create_workspace.py +++ b/utils/create_workspace.py @@ -5,12 +5,13 @@ from datetime import datetime class Workspace(): - def __init__(self, checkpoint, workspace, all_workspaces, default_owner=False): + def __init__(self, checkpoint, workspace, all_workspaces, default_owner=False, tag='Y'): self.path = "./logs/"+checkpoint+"/" self.workspace = str(workspace) self.new_path = "./logs/"+checkpoint+"_"+workspace+"/" self.workspaces = all_workspaces self.checkpoint = checkpoint + self.tag = tag split = Split(checkpoint, workspace, default_owner) # this is where all assets are mapped to what csv they refer to + what function they use for the split @@ -89,7 +90,7 @@ def run(self): sheet = self.map[m][0] # split_csv performs the actual split and outputs all csvs that were not in the csv directory print(f"{datetime.now()} Working on {m}...") - success = self.split_csv(m, module_function, sheet) + success = self.split_csv(m, module_function, sheet, self.tag) except Exception as e: pass @@ -97,12 +98,12 @@ def run(self): print(f"{datetime.now()} Please review error logs in the {self.new_path}errors/ directory to confirm successful split. 
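# Minimal pandas illustration of the tag-based selection that split_csv applies:
# rows are kept only where the target-workspace column equals the chosen tag.
# A toy frame stands in for asset_mapping.xlsx.
import pandas as pd

df = pd.DataFrame({"job_names": ["etl", "report"], "workspace_a": ["Y", "N"]})
tag = "Y"
selected = df[df["workspace_a"] == tag].reset_index()
assert list(selected["job_names"]) == ["etl"]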
") return 0 - def split_csv(self, module, module_function, sheet_name): + def split_csv(self, module, module_function, sheet_name, tag="Y"): # reads csv and inputs attribute columns where the workspace column is set to Y # you can set that variable to True or 1 or anything else that the client is using # but it will ignore anything else df = pd.read_excel("asset_mapping.xlsx", sheet_name = sheet_name) - current_df = df[df[self.workspace] == "Y"] + current_df = df[df[self.workspace] == tag] # send that subset dataframe to the module function found in Split class errors = module_function(current_df.reset_index()) #pushing all errors to a csv From 2a9604d3e9ec5b54631e8aa3d9d7241b26048dc9 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Mon, 10 Jul 2023 10:01:24 -0400 Subject: [PATCH 031/111] adding destination parameter --- convert_all_logs.py | 119 ++++++++++++++++++++------------------------ utils/to_csv.py | 108 +++++++++++++++++++++++----------------- 2 files changed, 115 insertions(+), 112 deletions(-) diff --git a/convert_all_logs.py b/convert_all_logs.py index 664c960a..58ee834b 100644 --- a/convert_all_logs.py +++ b/convert_all_logs.py @@ -5,104 +5,91 @@ import argparse import os -def main(checkpoint): +def main(checkpoint, destination="csv"): # where you want the csv files to be located - # make the csv directory if it not there - if "csv" not in os.listdir(): - os.mkdir("./csv") + # make the csv directory if its not there + if destination not in os.listdir(): + print(f"Creating {destination}...") + os.mkdir(f"./{destination}") # users - try: - users_data = util.read_log("users.log", checkpoint) + users_data = util.read_log("users.log", checkpoint) + if users_data == 1: + print("users.log not found in checkpoint session") + else: users_df = util.create_users(users_data) - util.save_to_csv(users_df, "users.csv") - except: - print("Error while trying to read users. Skipping...") + util.save_to_csv(users_df, "users.csv", destination) # instance profiles - try: - ip_data = util.read_log("instance_profiles.log", checkpoint) + ip_data = util.read_log("instance_profiles.log", checkpoint) + if ip_data == 1: # file not found + print("instance_profiles.log not found in checkpoint session. Skipping...") + else: ip_df = util.create_instance_profiles(ip_data) - util.save_to_csv(ip_df, "instance_profiles.csv") - except: - print("Error while trying to read instance profiles. Skipping...") - - try: - ipo_data = util.read_log("instance_pools.log", checkpoint) + util.save_to_csv(ip_df, "instance_profiles.csv", destination) + + # instance pools + ipo_data = util.read_log("instance_pools.log", checkpoint) + if ipo_data == 1: #file not found + print("instance_pools.log not found in checkpoint session. Skipping...") + else: ipo_df = util.create_instance_pools(ipo_data) - util.save_to_csv(ipo_df, "instance_pools.csv") - except: - print("Error while trying to read instance pools. Skipping...") + util.save_to_csv(ipo_df, "instance_pools.csv", destination) # groups - try: - groups_df = util.create_groups(checkpoint, directory_name = "groups") - util.save_to_csv(groups_df, "groups.csv") - except: - print("Error while trying to read users. Skipping...") - + groups_df = util.create_groups("groups", checkpoint) + util.save_to_csv(groups_df, "groups.csv", destination) # clusters - try: - clusters_data = util.read_log("clusters.log", checkpoint) + clusters_data = util.read_log("clusters.log", checkpoint) + if clusters_data ==1 : #file not found + print("clusters.log not found in checkpoint session. Skipping... 
") + else: clusters_df = util.create_clusters(clusters_data) - util.save_to_csv(clusters_df, "clusters.csv") - except: - print("Error while trying to read clusters. Skipping...") - + util.save_to_csv(clusters_df, "clusters.csv", destination) + # job - try: - jobs_data = util.read_log('jobs.log', checkpoint) + jobs_data = util.read_log('jobs.log', checkpoint) + if jobs_data == 1: #file not found + print("jobs.log not found in checkpoint session. Skipping... ") + else: jobs_acls = util.read_log('acl_jobs.log', checkpoint) jobs_df = util.create_jobs(jobs_data, jobs_acls) - util.save_to_csv(jobs_df, "jobs.csv") - except: - print("Error while trying to read jobs. Skipping...") + util.save_to_csv(jobs_df, "jobs.csv", destination) # shared - try: - shared_df = util.create_shared_logs(checkpoint, directory_name = "artifacts/Shared") - util.save_to_csv(shared_df, 'global_shared_logs.csv') - except: - print("Error while trying to read shared directory. Skipping...") + shared_df = util.create_shared_logs("artifacts/Shared", checkpoint) + util.save_to_csv(shared_df, 'global_shared_logs.csv', destination) # other artificats - try: - other_df = util.create_other_artifacts(checkpoint, directory_name = "artifacts") - util.save_to_csv(other_df, "global_logs.csv") - except: - print("Error while trying to read global artifacts. Skipping...") + other_df = util.create_other_artifacts("artifacts", checkpoint) + util.save_to_csv(other_df, "global_logs.csv", destination) # libraries - try: - libraries_data = util.read_log("libraries.log", checkpoint) + libraries_data = util.read_log("libraries.log", checkpoint) + if libraries_data == 1: # not found + print("libraries.log not found in checkpoint session. Skipping...") + else: libraries_df = util.create_libraries(libraries_data) - util.save_to_csv(libraries_df, "libraries.csv") - except: - print("Error while trying to read libraries. Skipping...") - + util.save_to_csv(libraries_df, "libraries.csv", destination) + # secret scopes - try: - scopes_df = util.create_scopes(checkpoint, directory_name = 'secret_scopes') - util.save_to_csv(scopes_df, "secret_scopes.csv") - except: - print("Error while trying to read secrets. Skipping...") + scopes_df = util.create_scopes("secret_scopes", checkpoint) + util.save_to_csv(scopes_df, "secret_scopes.csv", destination) # metastore - try: - metastore_df = util.create_metastore(checkpoint, directory_name = 'metastore') - util.save_to_csv(metastore_df, "metastore.csv") - except: - print('Error while trying to read metastore. Skipping..') + metastore_df = util.create_metastore(checkpoint, directory_name = 'metastore') + util.save_to_csv(metastore_df, "metastore.csv", destination) - create_spreadsheet.csv_to_excel("./csv") + create_spreadsheet.csv_to_excel(f"./{destination}") print("Sucessfully created spreadsheet asset_mapping.xlsx. 
") if __name__ == "__main__": all_args = argparse.ArgumentParser() - all_args.add_argument("--checkpoint", dest="checkpoint", default="", help="set if you are using a checkpoint during export") + all_args.add_argument("--checkpoint", "--session", dest="checkpoint", default="", help="set if you are using a checkpoint during export") + all_args.add_argument("--destination", dest="destination", default="csv", help="destination of converted logs (default: /csv)") args = all_args.parse_args() - main(args.checkpoint) + main(args.checkpoint, args.destination) diff --git a/utils/to_csv.py b/utils/to_csv.py index 1c4f584d..8afd76c1 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -10,14 +10,14 @@ def read_log(file_name, checkpoint): return data[:-1] except FileNotFoundError as e: - return '' + return 1 except Exception as e: print(f"Error while reading {file_name}...") return '' -def save_to_csv(data, file_name): +def save_to_csv(data, file_name, destination): try: - pd.DataFrame.from_dict(data).to_csv("./csv/" + file_name) + pd.DataFrame.from_dict(data).to_csv(f"./{destination}/{file_name}") except: print(f"Error while writing {file_name}...") @@ -73,13 +73,12 @@ def read_group(group_path): data = f.read().split("\n") return data except FileNotFoundError as e: - return '' + return 1 except Exception as e: - print(f"Error while reading {group_path}...") - print(e) - return '' + print(f"Error while reading group at path {group_path}: {e}") + return 2 -def create_groups(checkpoint = "", directory_name = "groups"): +def create_groups(directory_name = "groups", checkpoint = ""): groups_path = f"./logs/{checkpoint}/{directory_name}/" groups_dir = os.listdir(groups_path) groups = {} @@ -90,24 +89,26 @@ def create_groups(checkpoint = "", directory_name = "groups"): group_users = [] data = read_group(groups_path + g) + if data == 1: # group not found + print(f"Group {g} not found in the checkpoint. 
Skipping...") + continue # to next group + if data == 2: # unknown error + continue data = data[0] d = json.loads(data) group_name = d['displayName'] - try: + if 'roles' in d.keys(): roles = d['roles'] for role in roles: group_roles.append(role['value']) - except: - pass - try: + if 'members' in d.keys(): members = d['members'] for member in members: - group_members.append(member['display']) - group_users.append(member['userName']) - except: - pass + group_members.append(member.get('display', 'display not found')) + group_users.append(member.get('userName', 'userName not found')) + groups[group_name] = [group_roles, group_members, group_users] results = {} @@ -179,33 +180,42 @@ def create_jobs(data, jobs_acls): except Exception as e: print("Error in creating jobs...") - for a in jobs_acls: - try: - a = json.loads(a) - for j in a['access_control_list']: - if j.get('user_name', None) != None: - if j['all_permissions'][0]['permission_level'] == 'IS_OWNER': - job_owners.append(j['user_name']) - except: - job_owners.append('') + if jobs_acls != 1: # if it was found in the session + for a in jobs_acls: + try: + a = json.loads(a) + for j in a['access_control_list']: + if j.get('user_name', None) != None: + if j['all_permissions'][0]['permission_level'] == 'IS_OWNER': + job_owners.append(j['user_name']) + except: + job_owners.append('') return {'job_ids': job_ids, 'job_names': job_names, 'job_type':job_types, 'job_creator':job_creators, 'job_owner': job_owners, 'instance_profile': instance_profile} -def create_shared_logs(checkpoint = "", directory_name = "artifacts/Shared"): +def create_shared_logs(directory_name = "artifacts/Shared", checkpoint = ""): shared_path = f"./logs/{checkpoint}/{directory_name}" - notebooks = os.listdir(shared_path) - + try: + notebooks = os.listdir(shared_path) + except: + notebooks = [] + if not notebooks: + print("Shared directory not found in checkpoint session. Skipping...") return {"notebook_names" : notebooks} -def create_other_artifacts(checkpoint = "", directory_name = "artifacts"): +def create_other_artifacts(directory_name = "artifacts", checkpoint = ""): other_path = f"./logs/{checkpoint}/{directory_name}" - notebooks = os.listdir(other_path) - if "Users" in notebooks: - notebooks.remove("Users") - if "Shared" in notebooks: - notebooks.remove("Shared") - + try: + notebooks = os.listdir(other_path) + if "Users" in notebooks: + notebooks.remove("Users") + if "Shared" in notebooks: + notebooks.remove("Shared") + except: + notebooks = [] + if not notebooks: + print("Top level folders not found in checkpoint session. Skipping...") return {"global_folder_names" : notebooks} def create_libraries(data): @@ -213,21 +223,18 @@ def create_libraries(data): library_names = [] for d in data: if len(d) > 0: - try: - d = json.loads(d) - library_paths.append(d['path']) - library_names.append(d['path'].split("/")[-1]) - except Exception as e: - print("Error in creating libraries...") + d = json.loads(d) + library_paths.append(d['path']) + library_names.append(d['path'].split("/")[-1]) return {'library_paths': library_paths, 'library_names': library_names} -def create_scopes(checkpoint = "", directory_name = "secret_scopes"): +def create_scopes(directory_name = "secret_scopes", checkpoint = ""): try: secrets = os.listdir(f"./logs/{checkpoint}/{directory_name}/") return {"secret_scopes" : secrets} except: - print("Error while reading secrets directory...") + print("secret scopes directory not found in checkpoint session. 
Skipping...") def create_mounts(data): mount_paths = [] @@ -246,6 +253,15 @@ def create_mounts(data): def create_metastore(checkpoint = "", directory_name = 'metastore'): metastore_path = f"./logs/{checkpoint}/{directory_name}" - metastore_database = [i for i in os.listdir(metastore_path)] - - return {'metastore_database' : metastore_database} + try: + metastore_database = [i for i in os.listdir(metastore_path)] + except: + print("metastore directory not found in checkpoint session. Skipping...") + return + tables = [] + for db in metastore_database: + db_path = metastore_path + '/' + db + metastore_tables = [(db, tb) for tb in os.listdir(db_path)] + tables.extend(metastore_tables) + + return {'metastore' : tables} From 026bc7dd5ea29da65ec68b133c40a037ec1f7e91 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Mon, 10 Jul 2023 10:11:13 -0400 Subject: [PATCH 032/111] fixing grammar issue --- library_migration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library_migration.py b/library_migration.py index af07a42d..2941c1d4 100644 --- a/library_migration.py +++ b/library_migration.py @@ -157,7 +157,7 @@ def main(): print(f"{datetime.now()} EXPORTING LIBRARIES... ") libraries_data, no_libraries = export_pipeline(old_dbclient, new_dbclient) print() - confirm = input(f"Import from? (y/N) ") + confirm = input(f"Import? (y/N) ") if confirm.lower() in ["y", "yes"]: print(f"{datetime.now()} IMPORTING LIBRARIES... ") import_pipeline(new_dbclient, libraries_data) From dee67cba928d0f69ddc17c806f5776da554a9ddd Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Mon, 10 Jul 2023 10:13:12 -0400 Subject: [PATCH 033/111] adding cluster policies to sheet --- convert_all_logs.py | 10 +++++++++- utils/to_csv.py | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/convert_all_logs.py b/convert_all_logs.py index 58ee834b..f4cf8974 100644 --- a/convert_all_logs.py +++ b/convert_all_logs.py @@ -48,6 +48,14 @@ def main(checkpoint, destination="csv"): clusters_df = util.create_clusters(clusters_data) util.save_to_csv(clusters_df, "clusters.csv", destination) + # cluster policies + cluster_policies_data = util.read_log('cluster_policies.log', checkpoint) + if cluster_policies_data == 1: #file not found + print("cluster_policies.log not found in checkpoint session. Skipping... ") + else: + clusters_policies_df = util.create_cluster_policies(cluster_policies_data) + util.save_to_csv(clusters_policies_df, "cluster_policies.csv", destination) + # job jobs_data = util.read_log('jobs.log', checkpoint) if jobs_data == 1: #file not found @@ -82,7 +90,7 @@ def main(checkpoint, destination="csv"): util.save_to_csv(metastore_df, "metastore.csv", destination) create_spreadsheet.csv_to_excel(f"./{destination}") - print("Sucessfully created spreadsheet asset_mapping.xlsx. ") + print("Successfully created spreadsheet asset_mapping.xlsx. 
") if __name__ == "__main__": diff --git a/utils/to_csv.py b/utils/to_csv.py index 8afd76c1..2c31d0bf 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -151,6 +151,21 @@ def create_clusters(data): return {'cluster_id': cluster_id, 'cluster_name': cluster_name, 'creator_user_name': creator_user_name, 'policy_id': policy_id, 'instance_profile': instance_profile} +def create_cluster_policies(data): + policy_id = [] + policy_name = [] + + for d in data: + try: + d = json.loads(d) + policy_id.append(d['policy_id']) + policy_name.append(d['name']) + except Exception as e: + print("Error in creating cluster policies...") + + return {'policy_id': policy_id, 'policy_name': policy_name} + + def create_jobs(data, jobs_acls): job_ids = [] job_names = [] From 3e3c31eaf07d0e4df8d212e1879823cc21dd691e Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 16 Aug 2023 13:28:03 -0400 Subject: [PATCH 034/111] Added Terraform Exporter product notice (#10) Co-authored-by: Serge Smertin <259697+nfx@users.noreply.github.com> Co-authored-by: gregwood-db <42991536+gregwood-db@users.noreply.github.com> --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c3c5e7ac..0ebd2545 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Databricks Migration Tool +> **NOTE:** For a more extensive and maintained cross-workload migration solution, please use the [Databricks Terraform Exporter](https://registry.terraform.io/providers/databricks/databricks/latest/docs/guides/experimental-exporter), which creates Infrastructure-as-a-Code replicas for the entire manually-configured Databricks Workspaces. + This is a migration package to log all Databricks resources for backup and/or migrating to another Databricks workspace. Migration allows a Databricks organization to move resources between Databricks Workspaces, to move between different cloud providers, or to move to different regions / accounts. 
From 0f288fbd0588a0f661302109ea9f2c6715fc2747 Mon Sep 17 00:00:00 2001 From: Tejas Pandit Date: Thu, 24 Aug 2023 11:21:34 -0400 Subject: [PATCH 035/111] Notebook ACLs failure issue resolved --- dbclient/WorkspaceClient.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/dbclient/WorkspaceClient.py b/dbclient/WorkspaceClient.py index ffd61cce..f29af8a7 100644 --- a/dbclient/WorkspaceClient.py +++ b/dbclient/WorkspaceClient.py @@ -480,7 +480,6 @@ def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer if self.is_verbose(): logging.info("Skipped notebook path due to group exclusion: {0}".format(x.get('path'))) continue - if not checkpoint_set.contains(nb_path) and not nb_path.startswith(tuple(exclude_prefixes)): if self.is_verbose(): logging.info("Saving path: {0}".format(x.get('path'))) @@ -528,7 +527,7 @@ def _recurse_log_all_workspace_items(folder): if self.is_verbose(): logging.info("Skipped directory due to group exclusion: {0}".format(dir_path)) continue - + if not checkpoint_set.contains(dir_path) and not dir_path.startswith(tuple(exclude_prefixes)): num_nbs_plus = _recurse_log_all_workspace_items(folder) checkpoint_set.write(dir_path) @@ -583,6 +582,15 @@ def _acl_log_helper(json_data): futures = [executor.submit(_acl_log_helper, json_data) for json_data in read_fp] concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") propagate_exceptions(futures) + + def get_users_groups_target(self): + users = self.get('/preview/scim/v2/Users?attributes=userName').get('Resources', []) + groups = self.get('/preview/scim/v2/Groups').get('Resources', []) + + users = [i['userName'] for i in users] + groups = [i['displayName'] for i in groups] + + return (users, groups) def log_all_workspace_acls(self, workspace_log_file='user_workspace.log', dir_log_file='user_dirs.log', @@ -632,6 +640,26 @@ def log_all_workspace_acls(self, workspace_log_file='user_workspace.log', end = timer() logging.info("Complete Repo ACLs Export Time: " + str(timedelta(seconds=end - start))) + def fix_acls(self, acl, groups_target, users_target): + new_acls = [] + for permission in acl: + try: + group_name = permission.get('group_name', None) + user_name = permission.get('user_name', None) + if group_name != None and group_name in groups_target: + new_acls.append(permission) + elif user_name != None and user_name in users_target: + new_acls.append(permission) + elif group_name != None and group_name not in groups_target: + logging.error(f"Group name {group_name} not found in target workspace, removing ACLs {permission}") + elif user_name != None and user_name not in users_target: + logging.error(f"User name {user_name} not found in target workspace, removing ACLs {permission}") + else: + logging.error(f"User name {user_name} or group name {group_name} has errors for ACLs {permission}") + except Exception as e: + logging.error(f"Failed at filtering permissions: {str(e)}") + return new_acls + def apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set): """ apply the acl definition to the workspace object @@ -645,7 +673,7 @@ def apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set): object_type = object_acl.get('object_type', None) obj_path = object_acl['path'] logging.info(f"Working on ACL for path: {obj_path}") - + users_target, groups_target = self.get_users_groups_target() if not checkpoint_key_set.contains(obj_path): # We cannot modify '/Shared' directory's ACL if obj_path == "/Shared" and object_type == 
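# Condensed sketch of the ACL filtering added to WorkspaceClient above: permissions
# whose principal does not exist in the target workspace are dropped instead of failing
# the whole PATCH call. Toy in-memory sets stand in for the SCIM lookups.
def filter_acls(acl, target_users, target_groups):
    kept = []
    for permission in acl:
        user = permission.get("user_name")
        group = permission.get("group_name")
        if (user and user in target_users) or (group and group in target_groups):
            kept.append(permission)
    return kept

acl = [{"user_name": "keep@example.com"}, {"group_name": "missing-group"}]
assert filter_acls(acl, {"keep@example.com"}, {"admins"}) == [{"user_name": "keep@example.com"}]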
"directory": @@ -683,6 +711,7 @@ def apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set): acl_list = object_acl.get('access_control_list', None) access_control_list = self.build_acl_args(acl_list) if access_control_list: + access_control_list = self.fix_acls(access_control_list, groups_target, users_target) api_args = {'access_control_list': access_control_list} resp = self.patch(api_path, api_args) From b8bc1d22016be0b3bcb0c2113fefee65ec80f86e Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Mon, 28 Aug 2023 08:34:11 -0400 Subject: [PATCH 036/111] Fix missing key for hipaa option when using export_db (#12) Co-authored-by: gregory.wood --- dbclient/ClustersClient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 2a9be662..6e987dbc 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -13,7 +13,7 @@ def __init__(self, configs, checkpoint_service): self._checkpoint_service = checkpoint_service self.groups_to_keep = configs.get("groups_to_keep", False) self.skip_missing_users = configs['skip_missing_users'] - self.hipaa = configs['hipaa'] + self.hipaa = configs.get('hipaa', False) create_configs = {'num_workers', 'autoscale', From 6c37aa9a19a47acd077b235e9a5c6d5d2a1dffdf Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Mon, 28 Aug 2023 08:35:36 -0400 Subject: [PATCH 037/111] Update HiveClient.py adding table name to failed import metastore --- dbclient/HiveClient.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dbclient/HiveClient.py b/dbclient/HiveClient.py index a65ccd1f..e8a282cd 100644 --- a/dbclient/HiveClient.py +++ b/dbclient/HiveClient.py @@ -407,6 +407,7 @@ def import_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', v if not self.move_table_view(db_name, tbl_name, local_table_ddl): # we hit a table ddl here, so we apply the ddl resp = self.apply_table_ddl(local_table_ddl, ec_id, cid, db_path, has_unicode) + resp['table'] = db_name + "." 
+ tbl_name if not logging_utils.log_response_error(error_logger, resp): checkpoint_metastore_set.write(full_table_name) else: From 446b671ce677e600af0ab68e86ec94e77a1e05b6 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Mon, 28 Aug 2023 10:52:10 -0400 Subject: [PATCH 038/111] Update to_csv.py --- utils/to_csv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/to_csv.py b/utils/to_csv.py index 2c31d0bf..2f357f31 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -279,4 +279,5 @@ def create_metastore(checkpoint = "", directory_name = 'metastore'): metastore_tables = [(db, tb) for tb in os.listdir(db_path)] tables.extend(metastore_tables) - return {'metastore' : tables} + dbs, tbs = zip(*tables) + return {'databases' : dbs, "tables": tbs} From d2cf97a23ea038989667d61736aa8ff05a2cb0e9 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Mon, 28 Aug 2023 11:01:02 -0400 Subject: [PATCH 039/111] Update to_csv.py --- utils/to_csv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/to_csv.py b/utils/to_csv.py index 2f357f31..8962f015 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -276,8 +276,8 @@ def create_metastore(checkpoint = "", directory_name = 'metastore'): tables = [] for db in metastore_database: db_path = metastore_path + '/' + db - metastore_tables = [(db, tb) for tb in os.listdir(db_path)] + metastore_tables = [(db, tb, db+"."+tb) for tb in os.listdir(db_path)] tables.extend(metastore_tables) - dbs, tbs = zip(*tables) - return {'databases' : dbs, "tables": tbs} + dbs, tbs, both = zip(*tables) + return {'databases' : dbs, "tables": tbs, "name": both} From 24f00251246c9399cf39e93f14ca7bfa2481ed76 Mon Sep 17 00:00:00 2001 From: Tejas Pandit <30959949+tejasnp163@users.noreply.github.com> Date: Wed, 6 Sep 2023 12:05:20 -0400 Subject: [PATCH 040/111] Update ClustersClient.py to add old cluster id and name (#13) --- dbclient/ClustersClient.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 6e987dbc..ef569e46 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -309,6 +309,8 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus if 'cluster_id' in cluster_conf: checkpoint_cluster_configs_set.write(cluster_conf['cluster_id']) else: + cluster_resp['old_cluster_id'] = cluster_conf['cluster_id'] + cluster_resp['old_cluster_name'] = cluster_conf['cluster_name'] logging_utils.log_response_error(error_logger, cluster_resp) print(cluster_resp) From d4187ce8d51c8c0d29c1e7dc2ccec3a97e8dcb66 Mon Sep 17 00:00:00 2001 From: Tejas Pandit <30959949+tejasnp163@users.noreply.github.com> Date: Thu, 14 Sep 2023 15:32:38 -0400 Subject: [PATCH 041/111] Update WorkspaceClient.py to optimize ACL bug fix code --- dbclient/WorkspaceClient.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/dbclient/WorkspaceClient.py b/dbclient/WorkspaceClient.py index f29af8a7..5bbcca7a 100644 --- a/dbclient/WorkspaceClient.py +++ b/dbclient/WorkspaceClient.py @@ -30,6 +30,9 @@ def __init__(self, configs, checkpoint_service): self.groups_to_keep = configs.get("groups_to_keep", False) self.skip_missing_users = configs['skip_missing_users'] self.skip_large_nb = configs['skip_large_nb'] + self.get_user_group = False + self.users_target = [] + self.groups_target = [] _languages = {'.py': 'PYTHON', '.scala': 'SCALA', @@ -673,7 +676,14 @@ def 
apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set): object_type = object_acl.get('object_type', None) obj_path = object_acl['path'] logging.info(f"Working on ACL for path: {obj_path}") - users_target, groups_target = self.get_users_groups_target() + + if not self.get_user_group: + logging.info(f"self.get_user_group: {self.get_user_group}") + users_target, groups_target = self.get_users_groups_target() + self.users_target = users_target + self.groups_target = groups_target + self.get_user_group = True + if not checkpoint_key_set.contains(obj_path): # We cannot modify '/Shared' directory's ACL if obj_path == "/Shared" and object_type == "directory": @@ -711,7 +721,7 @@ def apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set): acl_list = object_acl.get('access_control_list', None) access_control_list = self.build_acl_args(acl_list) if access_control_list: - access_control_list = self.fix_acls(access_control_list, groups_target, users_target) + access_control_list = self.fix_acls(access_control_list, self.groups_target, self.users_target) api_args = {'access_control_list': access_control_list} resp = self.patch(api_path, api_args) From 00a0040d0b95251746226507c0b4f2fa43f6e8a8 Mon Sep 17 00:00:00 2001 From: Tejas Pandit <30959949+tejasnp163@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:56:38 -0400 Subject: [PATCH 042/111] Update to_csv.py for better error handling --- utils/to_csv.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/utils/to_csv.py b/utils/to_csv.py index 8962f015..20016b63 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -79,9 +79,12 @@ def read_group(group_path): return 2 def create_groups(directory_name = "groups", checkpoint = ""): - groups_path = f"./logs/{checkpoint}/{directory_name}/" - groups_dir = os.listdir(groups_path) - groups = {} + try: + groups_path = f"./logs/{checkpoint}/{directory_name}/" + groups_dir = os.listdir(groups_path) + except Exception as e: + print(str(e)) + return {'group_name': [], 'group_roles': [], 'group_members': [], 'group_users': [] } for g in groups_dir: group_roles = [] From 6e6dc9c3cb2b1c75ba5189cc4328cdf56cb5d25d Mon Sep 17 00:00:00 2001 From: Allistair Cota Date: Thu, 5 Oct 2023 15:33:48 -0400 Subject: [PATCH 043/111] added option to convert all usernames to lowercase --- export_db.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/export_db.py b/export_db.py index 8a0d12ec..1bf3d371 100644 --- a/export_db.py +++ b/export_db.py @@ -227,7 +227,12 @@ def main(): start = timer() client = dbclient(client_config) #parse list list of e-mail mapping pairs. 
Format is: old1@email.com:new1@e-mail.com,old2email.com:new2@email.com - emailpairs = args.replace_email.split(',') + if args.replace_email == "ALL_LOWERCASE": + scim_c = ScimClient(client_config, checkpoint_service) + old_emails = scim_c.get_users_from_log() + emailpairs = [old_email + ":" + old_email.lower() for old_email in old_emails] + else: + emailpairs = args.replace_email.split(',') print(str(len(emailpairs)) +' emails found to replace') for emailpair in emailpairs: if len(emailpair.split(':')) < 2: From f7b25414774f3c20eef6021b7a7716fea5d4d8e1 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 12 Oct 2023 12:50:13 -0400 Subject: [PATCH 044/111] Update HiveClient.py - include view name in failed import logs --- dbclient/HiveClient.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dbclient/HiveClient.py b/dbclient/HiveClient.py index 63d022e5..6ab806b3 100644 --- a/dbclient/HiveClient.py +++ b/dbclient/HiveClient.py @@ -440,6 +440,7 @@ def import_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', v db_name, view_name = unpack_view_db_name(full_view_name) local_view_ddl = metastore_view_dir + db_name + '/' + view_name resp = self.apply_table_ddl(local_view_ddl, ec_id, cid, db_path, has_unicode) + resp['view'] = full_view_name if not logging_utils.log_response_error(error_logger, resp): checkpoint_metastore_set.write(full_view_name) logging.info(resp) From ae99d6baf57aabd9414b8848e413e19978d24c5c Mon Sep 17 00:00:00 2001 From: Tejas Pandit <30959949+tejasnp163@users.noreply.github.com> Date: Wed, 18 Oct 2023 09:16:15 -0400 Subject: [PATCH 045/111] Update to_csv.py --- utils/to_csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/to_csv.py b/utils/to_csv.py index 20016b63..b7f28c9c 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -82,6 +82,7 @@ def create_groups(directory_name = "groups", checkpoint = ""): try: groups_path = f"./logs/{checkpoint}/{directory_name}/" groups_dir = os.listdir(groups_path) + groups = {} except Exception as e: print(str(e)) return {'group_name': [], 'group_roles': [], 'group_members': [], 'group_users': [] } From 0a2edda60f2fab22f82a710cc2177b3c1769a247 Mon Sep 17 00:00:00 2001 From: allistaircota Date: Wed, 18 Oct 2023 13:56:40 +0000 Subject: [PATCH 046/111] Filtering out DS_Store hidden file when listing metastore dbs --- utils/to_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/to_csv.py b/utils/to_csv.py index b7f28c9c..362e7a64 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -273,7 +273,7 @@ def create_mounts(data): def create_metastore(checkpoint = "", directory_name = 'metastore'): metastore_path = f"./logs/{checkpoint}/{directory_name}" try: - metastore_database = [i for i in os.listdir(metastore_path)] + metastore_database = [i for i in os.listdir(metastore_path) if i != ".DS_Store"] except: print("metastore directory not found in checkpoint session. 
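# Illustrative example of the ALL_LOWERCASE convenience added to export_db.py above:
# every exported user name is paired with its lower-cased form, producing the same
# old:new pairs that --replace-email normally takes. The input list here is made up.
old_emails = ["First.Last@Example.com", "ops@example.com"]
emailpairs = [old + ":" + old.lower() for old in old_emails]
assert emailpairs[0] == "First.Last@Example.com:first.last@example.com"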
Skipping...") return From 098074fb6e3d3def7bddbea1cac1170ef036210d Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Tue, 24 Oct 2023 10:23:55 -0400 Subject: [PATCH 047/111] change cluster names, add databases to csv --- convert_all_logs.py | 20 +++++++-- data/aws_cluster_hipaa.json | 2 +- data/aws_cluster_table_acls.json | 4 +- data/aws_cluster_table_acls_hipaa.json | 2 +- data/azure_cluster.json | 2 +- data/azure_cluster_table_acls.json | 2 +- data/gcp_cluster.json | 2 +- data/gcp_cluster_table_acls.json | 2 +- utils/to_csv.py | 61 +++++++++++++------------- 9 files changed, 55 insertions(+), 42 deletions(-) diff --git a/convert_all_logs.py b/convert_all_logs.py index f4cf8974..90d56e89 100644 --- a/convert_all_logs.py +++ b/convert_all_logs.py @@ -38,6 +38,8 @@ def main(checkpoint, destination="csv"): # groups groups_df = util.create_groups("groups", checkpoint) + if groups_df == 1: + print("groups.log not found in checkpoint session. Skipping...") util.save_to_csv(groups_df, "groups.csv", destination) # clusters @@ -67,10 +69,14 @@ def main(checkpoint, destination="csv"): # shared shared_df = util.create_shared_logs("artifacts/Shared", checkpoint) + if shared_df == 1: #file not found + print("Shared notebooks not found in checkpoint session. Skipping... ") util.save_to_csv(shared_df, 'global_shared_logs.csv', destination) # other artificats other_df = util.create_other_artifacts("artifacts", checkpoint) + if other_df == 1: #file not found + print("Global artifacts not found in checkpoint session. Skipping... ") util.save_to_csv(other_df, "global_logs.csv", destination) # libraries @@ -83,18 +89,26 @@ def main(checkpoint, destination="csv"): # secret scopes scopes_df = util.create_scopes("secret_scopes", checkpoint) + if scopes_df == 1: + print("secret_scopes.log not found in checkpoint session. Skipping...") util.save_to_csv(scopes_df, "secret_scopes.csv", destination) - # metastore + # just databases + databases_df = util.create_database(checkpoint, directory_name = 'metastore') + if databases_df == 1: + print("metastore.log not found in checkpoint session. Skipping...") + util.save_to_csv(databases_df, "databases.csv", destination) + + # entire metastore metastore_df = util.create_metastore(checkpoint, directory_name = 'metastore') + if metastore_df == 1: + print("metastore.log not found in checkpoint session. Skipping...") util.save_to_csv(metastore_df, "metastore.csv", destination) create_spreadsheet.csv_to_excel(f"./{destination}") print("Successfully created spreadsheet asset_mapping.xlsx. 
") if __name__ == "__main__": - - all_args = argparse.ArgumentParser() all_args.add_argument("--checkpoint", "--session", dest="checkpoint", default="", help="set if you are using a checkpoint during export") all_args.add_argument("--destination", dest="destination", default="csv", help="destination of converted logs (default: /csv)") diff --git a/data/aws_cluster_hipaa.json b/data/aws_cluster_hipaa.json index 40b9a2ab..9ab7424a 100644 --- a/data/aws_cluster_hipaa.json +++ b/data/aws_cluster_hipaa.json @@ -1,6 +1,6 @@ { "num_workers": 1, - "cluster_name": "Workspace_Migration_Work_Leave_Me_Alone", + "cluster_name": "E2_Migration", "spark_version": "10.4.x-scala2.12", "aws_attributes": { "first_on_demand": 1, diff --git a/data/aws_cluster_table_acls.json b/data/aws_cluster_table_acls.json index 32113a54..94738c24 100644 --- a/data/aws_cluster_table_acls.json +++ b/data/aws_cluster_table_acls.json @@ -1,9 +1,9 @@ { "num_workers": 1, - "cluster_name": "E2_Migration", + "cluster_name": "E2_Migration_Table_ACLs", "spark_version": "10.4.x-scala2.12", "spark_conf": { - "spark.databricks.acl.dfAclsEnabled": "true", + "spark.databricks.acl.dfAclsEnabled": "true" }, "aws_attributes": { "first_on_demand": 1, diff --git a/data/aws_cluster_table_acls_hipaa.json b/data/aws_cluster_table_acls_hipaa.json index 781f376f..46100617 100644 --- a/data/aws_cluster_table_acls_hipaa.json +++ b/data/aws_cluster_table_acls_hipaa.json @@ -1,6 +1,6 @@ { "num_workers": 1, - "cluster_name": "API_Table_ACL_Work_Leave_Me_Alone", + "cluster_name": "E2_Migration_Table_ACLs", "spark_version": "10.4.x-scala2.12", "spark_conf": { "spark.databricks.cluster.profile": "serverless", diff --git a/data/azure_cluster.json b/data/azure_cluster.json index e2054c51..61c42ebf 100644 --- a/data/azure_cluster.json +++ b/data/azure_cluster.json @@ -1,6 +1,6 @@ { "num_workers": 1, - "cluster_name": "API_Metastore_Work_Leave_Me_Alone", + "cluster_name": "E2_Migration", "spark_version": "10.4.x-scala2.12", "spark_conf": {}, "node_type_id": "Standard_D8_v3", diff --git a/data/azure_cluster_table_acls.json b/data/azure_cluster_table_acls.json index 7b6e3206..7163cc02 100644 --- a/data/azure_cluster_table_acls.json +++ b/data/azure_cluster_table_acls.json @@ -1,6 +1,6 @@ { "num_workers": 1, - "cluster_name": "API_Table_ACL_Work_Leave_Me_Alone", + "cluster_name": "E2_Migration_Table_ACLs", "spark_version": "10.4.x-scala2.12", "spark_conf": { "spark.databricks.cluster.profile": "serverless", diff --git a/data/gcp_cluster.json b/data/gcp_cluster.json index 3a7c07e6..0f0a70c9 100644 --- a/data/gcp_cluster.json +++ b/data/gcp_cluster.json @@ -1,6 +1,6 @@ { "num_workers": 1, - "cluster_name": "Workspace_Migration_Work_Leave_Me_Alone", + "cluster_name": "E2_Migration", "spark_version": "10.4.x-scala2.12", "gcp_attributes": { "first_on_demand": 1 diff --git a/data/gcp_cluster_table_acls.json b/data/gcp_cluster_table_acls.json index 062c24ba..3b275404 100644 --- a/data/gcp_cluster_table_acls.json +++ b/data/gcp_cluster_table_acls.json @@ -1,5 +1,5 @@ { - "cluster_name": "API_Table_ACL_Work_Leave_Me_Alone", + "cluster_name": "E2_Migration_Table_ACLs", "spark_version": "10.4.x-scala2.12", "spark_conf": { "spark.databricks.cluster.profile": "serverless", diff --git a/utils/to_csv.py b/utils/to_csv.py index 362e7a64..647d7609 100644 --- a/utils/to_csv.py +++ b/utils/to_csv.py @@ -79,13 +79,12 @@ def read_group(group_path): return 2 def create_groups(directory_name = "groups", checkpoint = ""): - try: - groups_path = f"./logs/{checkpoint}/{directory_name}/" - 
groups_dir = os.listdir(groups_path) - groups = {} - except Exception as e: - print(str(e)) - return {'group_name': [], 'group_roles': [], 'group_members': [], 'group_users': [] } + if directory_name not in os.listdir(f"./logs/{checkpoint}/"): + return 1 + + groups_path = f"./logs/{checkpoint}/{directory_name}/" + groups_dir = os.listdir(groups_path) + groups = {} for g in groups_dir: group_roles = [] @@ -214,27 +213,21 @@ def create_jobs(data, jobs_acls): def create_shared_logs(directory_name = "artifacts/Shared", checkpoint = ""): + if directory_name not in os.listdir(f"./logs/{checkpoint}/"): + return 1 shared_path = f"./logs/{checkpoint}/{directory_name}" - try: - notebooks = os.listdir(shared_path) - except: - notebooks = [] - if not notebooks: - print("Shared directory not found in checkpoint session. Skipping...") + notebooks = os.listdir(shared_path) return {"notebook_names" : notebooks} def create_other_artifacts(directory_name = "artifacts", checkpoint = ""): + if directory_name not in os.listdir(f"./logs/{checkpoint}/"): + return 1 other_path = f"./logs/{checkpoint}/{directory_name}" - try: - notebooks = os.listdir(other_path) - if "Users" in notebooks: - notebooks.remove("Users") - if "Shared" in notebooks: - notebooks.remove("Shared") - except: - notebooks = [] - if not notebooks: - print("Top level folders not found in checkpoint session. Skipping...") + notebooks = os.listdir(other_path) + if "Users" in notebooks: + notebooks.remove("Users") + if "Shared" in notebooks: + notebooks.remove("Shared") return {"global_folder_names" : notebooks} def create_libraries(data): @@ -245,20 +238,17 @@ def create_libraries(data): d = json.loads(d) library_paths.append(d['path']) library_names.append(d['path'].split("/")[-1]) - return {'library_paths': library_paths, 'library_names': library_names} def create_scopes(directory_name = "secret_scopes", checkpoint = ""): - try: - secrets = os.listdir(f"./logs/{checkpoint}/{directory_name}/") - return {"secret_scopes" : secrets} - except: - print("secret scopes directory not found in checkpoint session. 
Skipping...") + if directory_name not in os.listdir(f"./logs/{checkpoint}/"): + return 1 + secrets = os.listdir(f"./logs/{checkpoint}/{directory_name}/") + return {"secret_scopes" : secrets} def create_mounts(data): mount_paths = [] mount_sources = [] - for d in data: try: d = json.loads(d) @@ -266,11 +256,20 @@ def create_mounts(data): mount_sources.append(d['source']) except Exception as e: print("Error in mounts...") - return { 'mount_paths' : mount_paths, 'mount_sources' : mount_sources } +def create_database(checkpoint = "", directory_name = 'metastore'): + if directory_name not in os.listdir(f"./logs/{checkpoint}/"): + return 1 + metastore_path = f"./logs/{checkpoint}/{directory_name}" + return {'databases': [i for i in os.listdir(metastore_path) if i != ".DS_Store"]} + + def create_metastore(checkpoint = "", directory_name = 'metastore'): + if directory_name not in os.listdir(f"./logs/{checkpoint}/"): + return 1 + metastore_path = f"./logs/{checkpoint}/{directory_name}" try: metastore_database = [i for i in os.listdir(metastore_path) if i != ".DS_Store"] From 47e37362719c22a3d72859edf67c0a39c2e50f02 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Tue, 24 Oct 2023 11:08:12 -0400 Subject: [PATCH 048/111] split on databases + tables --- utils/split_logs.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/utils/split_logs.py b/utils/split_logs.py index 0baa444b..5c5f115a 100644 --- a/utils/split_logs.py +++ b/utils/split_logs.py @@ -509,18 +509,25 @@ def acl_directories(self, df, file_name="acl_directories.log"): self.write_logs(data_write, file_name) return errors - def metastore(self, df, file_name=None): + def metastore(self, df, file_name=None, split_tables=False): databases = os.listdir(self.path + "metastore/") errors = {'Data':[], 'Error':[]} - - for db in df['metastore_database']: - try: + for dbtb in df['both'].tolist(): + try: + db = dbtb.split(".")[0] if "metastore" not in os.listdir(self.new_path): os.mkdir(self.new_path+"metastore/") new_folder_path = self.new_path+"metastore/"+db src_path = self.path+"metastore/"+db - if db not in os.listdir(self.new_path+"metastore/"): - shutil.copytree(src_path, new_folder_path) + if split_tables: + tb = dbtb.split(".")[1] + new_file_path = new_folder_path + "/" + tb + src_file_path = src_path + "/" + tb + if tb not in os.listdir(new_folder_path): + shutil.copyfile(src_file_path, new_file_path) + else: + if db not in os.listdir(self.new_path+"metastore/"): + shutil.copytree(src_path, new_folder_path) except Exception as e: errors['Data'].append(db) errors['Error'].append(e) @@ -537,7 +544,7 @@ def success_metastore(self, df, file_name='success_metastore.log'): d = d.strip() d = json.loads(d) database = d['table'].split(".")[0] - if database in df['metastore_database'].tolist(): + if database in df['databases'].tolist(): data_write.append(d) except Exception as e: errors['Data'].append(d) @@ -556,7 +563,7 @@ def database_details(self, df, file_name="database_details.log"): d = d.strip() d = json.loads(d) database = d['Namespace Name'] - if database in df['metastore_database'].tolist(): + if database in df['databases'].tolist(): data_write.append(d) except Exception as e: errors['Data'].append(d) @@ -576,7 +583,7 @@ def table_acls(self, df, file_name="logs/table_acls/00_table_acls.json.gz"): if len(d) != 0: d = d.strip() d = json.loads(d) - if len(df.loc[(df['metastore_database'] == d['Database'])]) > 0: + if len(df.loc[(df['databases'] == d['Database'])]) > 0: data_write.append(d) except 
Exception as e: errors['Data'].append(d) From aaa6dc60722e8a7ecd7c39b1f07cec4c89418b95 Mon Sep 17 00:00:00 2001 From: Tejas Pandit <30959949+tejasnp163@users.noreply.github.com> Date: Tue, 31 Oct 2023 16:24:51 -0400 Subject: [PATCH 049/111] Restrict the job renaming to imported jobs with ::: only --- dbclient/JobsClient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbclient/JobsClient.py b/dbclient/JobsClient.py index e9298a27..262cd028 100644 --- a/dbclient/JobsClient.py +++ b/dbclient/JobsClient.py @@ -69,7 +69,7 @@ def update_imported_job_names(self, error_logger, checkpoint_job_configs_set): job_name = job['settings']['name'] # job name was set to `old_job_name:::{job_id}` to support duplicate job names # we need to parse the old job name and update the current jobs - if checkpoint_job_configs_set.contains(job_name): + if checkpoint_job_configs_set.contains(job_name) or (':::' not in job_name): continue old_job_name = job_name.split(':::')[0] new_settings = {'name': old_job_name} From ccdae7208985c118e5470bc28d92c59a92d336b4 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Tue, 7 Nov 2023 13:30:32 -0500 Subject: [PATCH 050/111] export single database --- dbclient/HiveClient.py | 7 +++++-- dbclient/parser.py | 3 +++ tasks/tasks.py | 3 ++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/dbclient/HiveClient.py b/dbclient/HiveClient.py index 6ab806b3..ba5d5822 100644 --- a/dbclient/HiveClient.py +++ b/dbclient/HiveClient.py @@ -274,7 +274,7 @@ def export_database(self, db_name, cluster_name=None, iam_role=None, metastore_d success_metastore_log_path, current_iam, checkpoint_metastore_set, has_unicode) def export_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', db_log='database_details.log', - success_log='success_metastore.log', has_unicode=False): + success_log='success_metastore.log', has_unicode=False, database=None): start = timer() checkpoint_metastore_set = self._checkpoint_service.get_checkpoint_key_set( wmconstants.WM_EXPORT, wmconstants.METASTORE_TABLES) @@ -300,7 +300,10 @@ def export_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', d database_logfile = self.get_export_dir() + db_log if os.path.exists(success_metastore_log_path): os.remove(success_metastore_log_path) - all_dbs = self.get_all_databases(error_logger, cid, ec_id) + if database: + all_dbs = database + else: + all_dbs = self.get_all_databases(error_logger, cid, ec_id) resp = self.set_desc_database_helper(cid, ec_id) if self.is_verbose(): logging.info(resp) diff --git a/dbclient/parser.py b/dbclient/parser.py index 4ff90f09..c09fb403 100644 --- a/dbclient/parser.py +++ b/dbclient/parser.py @@ -520,6 +520,9 @@ def get_pipeline_parser() -> argparse.ArgumentParser: parser.add_argument('--skip-missing-users', action='store_true', default=False, help='Skip missing principles during import.') + parser.add_argument('--database', action='store', default=False, + help='Set a default job owner for jobs without an owner.') + # Pipeline arguments parser.add_argument('--session', action='store', default='', help='If set, pipeline resumes from latest checkpoint of given session; ' diff --git a/tasks/tasks.py b/tasks/tasks.py index 597dd3ed..9d142c54 100644 --- a/tasks/tasks.py +++ b/tasks/tasks.py @@ -295,7 +295,8 @@ def __init__(self, client_config, checkpoint_service, args, skip=False): def run(self): hive_c = HiveClient(self.client_config, self.checkpoint_service) hive_c.export_hive_metastore(cluster_name=self.args.cluster_name, - 
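# Minimal, standalone argparse sketch of how the selective-export flag behaves once
# --database accepts a list (nargs="+"), as in the surrounding parser.py changes;
# an empty default means export_hive_metastore falls back to exporting every database.
# The example values are made up.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--database", nargs="+", action="store", default=[],
                    help="list of databases to selectively export")
args = parser.parse_args(["--database", "sales", "marketing"])
assert args.database == ["sales", "marketing"]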
has_unicode=self.args.metastore_unicode) + has_unicode=self.args.metastore_unicode, + database=self.args.database,) class MetastoreImportTask(AbstractTask): From b00a44bea46d6748bb125e8a2aa68655a258d7ff Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Tue, 7 Nov 2023 14:52:04 -0500 Subject: [PATCH 051/111] changing databases param to accept list --- dbclient/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbclient/parser.py b/dbclient/parser.py index c09fb403..e795f5a9 100644 --- a/dbclient/parser.py +++ b/dbclient/parser.py @@ -520,8 +520,8 @@ def get_pipeline_parser() -> argparse.ArgumentParser: parser.add_argument('--skip-missing-users', action='store_true', default=False, help='Skip missing principles during import.') - parser.add_argument('--database', action='store', default=False, - help='Set a default job owner for jobs without an owner.') + parser.add_argument('--database', nargs="+", action='store', default=[], + help='list of databases to selectively export') # Pipeline arguments parser.add_argument('--session', action='store', default='', From 9df4ff6d448927e3bd0a1ca814d8b0cb48651e1f Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 30 Nov 2023 13:43:42 -0500 Subject: [PATCH 052/111] Add files via upload --- dbclient/WorkspaceClient.py.zip | Bin 0 -> 10884 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 dbclient/WorkspaceClient.py.zip diff --git a/dbclient/WorkspaceClient.py.zip b/dbclient/WorkspaceClient.py.zip new file mode 100644 index 0000000000000000000000000000000000000000..7ed224397e46cc0ff8f38fcec8462295e34c2ccf GIT binary patch literal 10884
[10,884 bytes of base85-encoded binary zip data omitted]
literal 0 HcmV?d00001 From 59e8f9f8e8981503361a2bcc0b8b7e5aefda7e0c Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Fri, 1 Dec 2023 11:39:41 -0500 Subject: [PATCH 053/111] adding DBFS sizing notebook --- data/notebooks/DBFS Sizing Notebooks.py | 44 +++++++++++++++++++++++++ 1 file
changed, 44 insertions(+) create mode 100644 data/notebooks/DBFS Sizing Notebooks.py diff --git a/data/notebooks/DBFS Sizing Notebooks.py b/data/notebooks/DBFS Sizing Notebooks.py new file mode 100644 index 00000000..66205414 --- /dev/null +++ b/data/notebooks/DBFS Sizing Notebooks.py @@ -0,0 +1,44 @@ +# Databricks notebook source +def recursiveDirSize(path): + total = 0 + dir_files = dbutils.fs.ls(path) + for file in dir_files: + if file.isDir(): + total += recursiveDirSize(file.path) + else: + total += file.size + return total + + +# COMMAND ---------- + +dbfs_paths = dbutils.fs.ls("dbfs:/") + +paths = [] +sizes = [] + +skip_paths = ["dbfs:/mnt/", "dbfs:/databricks/", "dbfs:/databricks-datasets/","dbfs:/databricks-results/"] + +for p in dbfs_paths: + try: + print("Working on", p.path) + if p.path in skip_paths: + continue + p_size = recursiveDirSize(p.path) + paths.append(p.path) + sizes.append(p_size) + print("Completed", p.path) + except: + print(f"Could not find size for path {p}") + +# COMMAND ---------- + +spark.createDataFrame([(i, j/1e6) for i, j in zip(paths, sizes)], schema = ["Path", "Size in MB"]).display() + +# COMMAND ---------- + +spark.createDataFrame([(i, j/1e9) for i, j in zip(paths, sizes)], schema = ["Path", "Size in GB"]).display() + +# COMMAND ---------- + +dbutils.fs.ls("dbfs:/") From fe2002dd289ae55eb1dcfd02e713e447a9107c91 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 5 Dec 2023 14:45:33 -0500 Subject: [PATCH 054/111] Add files via upload --- data/notebooks/Archive.zip | Bin 0 -> 10668 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/notebooks/Archive.zip diff --git a/data/notebooks/Archive.zip b/data/notebooks/Archive.zip new file mode 100644 index 0000000000000000000000000000000000000000..223442167e114f853482a517106bdfeac3221517 GIT binary patch literal 10668 zcmb_?1#BMclI3q^J7#A3&CJZq%*@OXGcz;A?3me(nPO&Uh?$w~#FNa-oxStst@LKK z)oQ6hUDd5Fo$plDQIr7#M*~3r-eTQCH30utNwR z@93qbiU5Gh_H{5T`n?zY-rYRm03hHGAOOH`M*u)q#(3Nw^{22WFW7zk?}>BvlPx8* zapq1f5?N4n5wWNgE1ChH+u5=_1P1Z9lU!-CY(XK={Cy$V4^OO4!Y$Wb3M@T@b#o&H z8&H}wQ0N}@f7;>n>97LFiti%O7-(L?>R1m$5mSjY6H#QOR9APqp^!_?`?SYdh^3L- zd{NF0UHbX4!;-@W%ZG(g433-|%cX*~Wji{#lTYVd*2?)RBEj7?mA3Gip*hyO>_5Hd z%`gN=%|H>(Fl>OtvIyZwh+rr7LgK?1cLwcPbL0onCP+Ko1X>0Ii;MP#!ih>b zu#z+7NG}A<`o5|%vU@P!*J9?gIy7A2luX@SnI3;B1UBd$!x5}S6KtCfj(;U|b>@bW z3Sl)Y;FclZE8}IcgFtKKi|TdP_vOA1=ipPZE_@ zJX}DfkcwGtbLgaRjfvrkcNy-LV~E_Av`VkOS~yQkE==Yb^8g6*lsjr5CrrtC+a>&j z#oq3^-(04GAcUMv*5XYZMq4f1(|Lxy-xHo(il*tSi@~0yE+w;YIgmd?FCId=&Nlfl>F1t;_>HQP(OTs8osXy?XP(@ZYBu(M_3Y ze$HF{Fbf)88N|0;?-{eku*g6ZcB@$ysahdQ^_P7a8;iP`JhL&|-ML1X#F(IcT{tV- zl^D-vw9YsjYuzW!)KHh*h1G;CSmr1=gZ!K@8`V!kE2iUp#=LvclpWU473O_EbKLC5-fZKBr451b+Avp5@qkwldNIebtA-&!7)fN@$_VWw?*=1MIzDplxSwJ$jC z9$p+96g5n~8T0HUpW!dI2Bqs4EJpocKLjm$f{`&2qkCXo{-q|CHH|~jO!qJDtaiun z0X0O@bR{}zr+|E0$S;P0L zh1Pi=Q`n{zQ48!eH#{GBshE8ZqG)TU+RF6dn4Uzqrb6g!Sg>tqSiqz-%hZz0VDdcv za&NO`(1i98F+U^x>(@M6;*~f)obGbFq4)mkoeMa$lgQ>ageG*j?Sg(XBocg;B0uU9 zp57q%U32eA7bk4Et?Ot$zcqHA;|B-@+l;imVp;2?Anx5yRD1#7x8k z9K?bB1dVr~5Rd=IZkrcfSRNI(`C-%;Yl+PomN?wLzoHBjv^<~jlqWa<@CUvW{~g%= z;Mw1S?aw*nZ(wtCwX$_#_%AH0{*7a0{}app!^b~@qZ`@p2><27)xp5T#r^NVnDg7w zAAtM^U|iJFaoB4|_Is)|T!hk!jc-_$0MXe2ShOC?z_J2gcW^-IX5&z_&_vf(7q0yU z2BN2hnCGt_-7|SBYpbd&Q-FnG1AmKY6BSs1&wlld^WRFT8(^ zxX-Ccv{<5RoJ@auSWt6zHZaG9IM-?Dv<@kj>$`2)lUBsCh&fHjRO$VJs#U}40ahxV zFze*f?TNdD4Uka{DrGApKUUzqOo>ftNpL&p)E!be_dHPTL~%qwL^lot^~zDtl52P7 z4#qfo!2TLgH0v(-(pkZpLl|V_;Dgea7kQI?dv(5B@s4pkZ6xq5SV6@ 
z5wq$ps1UkUA2vNPKB^elN{G*TLZXHBn3m~BctRv2 zn4(=1xiD;NMOU&(Aj!5XjAY=C^aG^|$06(PKHF0d)14)^zXpR5#(h6lA0%?Ly}?2% z>H&|gu5Vms`NVjR>~-I5m!61G$`FySt5{_9rL~_t|io|1apucKN;6! zmAUu0zrWDE&nLnpJGyDI*pk5YE2fAa!okpRg{6*?%W6f#>RjIILCLGxfdKc{!l%+( zXJxf-9(+H19SuVf)S52=QrQVcE?Gvk#`uMrt?l=Te9>OJn9hX4GsdDC1Ulq_g)|UZ z0`{&z^jGmRg0!2!rO%tReBrj%&-BO&mE#2073qX;W}MRTsa!|dW=~t0!>;4|of7o! zd5I!#eTZN=u& zg3T5Y$>`BqMOlLrb=K&=;-iCYkgUj8r^e~`Wsnw4P#h_pj{s_nbQmEwb)8(K$Dv1c z@0NpMEDJ)2;pEP!L1B0}p8Ar;)9Hfd5vnnGFsATHlpi%p3!jODktgTE%CY9(skCW+ zY3pkYHkOg*9Blipxx^K{j3N)*z+71w_bq7huQBRl-iovE#RP)U&b~|?_l3j5 zH;o_mCPvNbatC=(OOJrOk~veGX;eQcvz;&32Ca4M7{6x~aQdzLihpT#{{$xMhP8mT zRMygEM6+7(NlDohXqWHTd2FE{2fPFht#VQdPK8ptI9*>Vm`|f}Y|N|r9%@S+^uT5S znPi+xp5qoDN>fDRh;Hhrw}?B7CCsTOj`{;SU2=vtv>9KKeCW>M$RF6s#OP?Bui@Kg z3QC~+UdkkI+YqGKjXUAIJNX<{2(V13|5%WeI*V`_{@vxy1Ra{bFzLX;@||^(&vb z0n{w;1Dnz6=wKPMDrwMVE!x`GhAQjl{3z{3Ik*~$T04y**D_w2yXp+!mrl7Iv|20g z`PF~|f~@%DwX1=PH`02bdp+9+AJ0!0NB%=c%@y%yoFZ-Xdk$mP2GL_sJy7fHV7^O! z!M}L!VP!7Dce-`=Y=OEm&CMoQzQkI2kGSvSOiJ~;XXMp!wI!W= z%MSdOY_J{-`99C={G=HZV^i9=XA4=W;{eBy3(#1Bo zh&`=;8;rwN9{w@J7E2SH0>|!|1)-SIb($P`ry&tB-ACCYlpW{DhK#`yB=&&=sPicG z6Vpl`2yGYZTnb|(ywbI=hHiqWLyke>pa#mY3z&jJ)@?2(3Rvi8w{=n}<9$PT!NOs& z=_YPm1xFv{_>}sB-s4A=V7xA!#y4C3&oV5HL87dG`SdRJLwL< zS(f~)?85ujj(2&cLbZU!x_y5L+S{KA<#Ud;6}c4zLTWnOh&vM{{0+vazKT1JHy>Gn ziY-OYZ9SHnsrxo?yxTh-k?AWTm2eL(f$u-U-XjM zR9(uoCS9lzEs@BMLmYCdH|ohV4}-S#qlss#6#VDTQzEbKWoEbp2~rKY^?Lr{Fd=kH_)fUHeH@ zKyw(8N+bxSTf3uR{mQl;NOev^-1|!DehzaUV9tr}ri4c4?n4oTJ`$)g2H!!%U8&>x zFbsMeNf>_}ej}8X5AzGhT*tGgT?TO&kf@9KTGEqQU^tvoeWeMoNx3q3!#M{T3_ zEe@lCGW}I0G&8odzIa0SLFghl6DD+ANJ0^p8*d8~0b$pEnW6SzfcL}N93vX-8f@Ss z^x%=+AsR@VswMw{kkXTO&!yECfrnPW_5GAqS7pf9JVdWc*Y-9!hvFg~x&lVaLNac2 z6)Zg2v90io2&Y1m)lTd>I_@rA;(6JmiiY$v__`c=PE@$=!INrw3@6+WDRStcELHf@ zvruKgu%evoLi7<0DB%y6^C(RWrM}RwOmA|hxX2Eb`9VW3gQ~gNl;g=8yP(=z(Me-T z+KiAGEVvy+9zUYyBRg^yp{I{K*%F%1VWlYL9hehNG2$7=i!4>q(82E(&($KLZ(BVM zl0fOC9K7tVJ;I+^N@`T5!jE~LB)s9EcyJzTC)|`0tY>EovlYzwr^jejwr488)MqE5 z5OCy@Kq}efS%S#rOVy|HmU?@IbS-qO7}%WL(2`U6;rLOWhG4K%!1NvO&qGhYP*e*S zt$q*tJXIquee@nGZJR>r3cVm0f1Mgdis9UEmipN}fYivpy89w;N1jk>di$IB75U4w zh>HWd9Kn&~3Qn&dblKOz?TEV~ZlWt1Lfyf93KZ4lYnVKTi_O@VyysXlq3ZvwfG?#n|G32+H5I$4OY7vhn02TQ#tM2q_o?` zDr|-Clh8DK%=GE2QN?_{?G>`8og_Wuv$EO^PA{f&d4NTc_gJE#ZTg$mK9SrOZ-&(J zp~ZANPn?tQKVN6m-8LLL{iVX7F`UL=LGHMCq5E(`JSC}<+CM=4piDR9Y}geT0PqJ( z{yEA>>itJM=Rc|*7e`ww*Zx7rF`#do3tEM=IA=NJ&P^^0kn2DjtE4380(& zox!-!4Wc|7BS}Qc*>b@?PaGmqNkFCBFPxjUB*cQYPUf6=zsxyPhDT~|Jlhd#R`r+(p+MyH8ep7!Uia5@uOFSqj|3HhSXWYRIljWgG{SR~=eJ-&-7@tO z+KD<2d*+wPxb}^o_hcdcL5FbFkV0-OemVahXCWkMA&HGve?9*ea;WhoC}Un3Qh$Vd z8q`UqCW6J_`$-x@juoBtDJ@>|HxvZCMT=yzf~Q~kvfG*JRhtA0&sO`GIfr;5RrcP{ zEneEDN&N_g_%h#kNH6^p$L;DZX?&!X$547rH4h7!ZRCSTph{#K?KNe8lC7{+A95n3 zMsL!O5T+#|QKZ|hYGi%-WJr!FsO{T#mKRts7ek@XhB@D^06sosn&1sd5`b)bxKhrM z=@N->boD(GtBDI;EKlaBjoNR)^%&;3Ze~B21@5CMJaLw`DP~HKf&DN0DLP8GZ zNf(C~i6}Gus7c8+s+6)2JJ3}gXDlVxFXclsOOtzjeca%8GVc@c0g1(&p(bGpl-*dt zWLo6JVlHo9^SSfY;o9sG0^V|9qM&!k!FTqMIako6fZ;%8a#%*>dc~cmfNxkpT{J1I zgbn3>A&Qt&kZ$n8A(*COuJs18h(sL7*kCBHAPRMe;-HdsY4JjHGs8D-5TK`hFKgk zpTALxj5*{P;#$LK62eizZy~OMhjFf>M_Hu3<9Y4vlJWrbs;N|Gf!Zqu(e1j1vZ8;1 z!p361wJuMN%fedAD>9pOP*NV6JUWTM4E3s)c@eCG11RAo(}k8$D+uYDmOuka5yUZ!Un^qmNBp8VmHln zFa{Ivg^9G9;_z#(Dr|Y4Z9M7gu?g`xwM@;fM;zB~ZG8m4Or-n?i6;#rr}Ro(%@tF~ zz%O57AjpMf3drV}bMdNK-Qma0ScyN{NYdR^#52^U+D5bvSAQZ$bybR)Os-6VXc_i)B+_9g)OhW3v0W+Np(x|FSg9EQlJfNv1jrkX1+;_0levYR8_D zL9L*w0^%wvt|xI@RfexmtgUIPJ2O}V2OfpMrJzR@vxfLG*sa!f2Sig z=}FT*4-phw*a!M}EEDIogm{!eQN2C!P`veLjv?U_^CBVqD)KxC&gE+2%O}W7V`E|E 
zAr&vdDcfm+;O$)O({r&{it^NJXFAoQM}H_z#aU;L@dyJ!f^+{w2C?b1u?K8a&pv7| z_XqKJsaHmXCBhwoz~noj;J045AQhxb2eq^+#1NhAxN_o33$7rBY?GSk59bq9QH_ZU zXjG!=r4qq`G2C1F9ES7V#os{GW-f-PQMbfNacxX)*}xKrRyT%80%y_idUBM_A7K5c zj>r2cD+2<`NRsPmfEAg`)*@jrao3x04J%l;4FlxSZCm;ou`wV_K^5c~@PQrtkU&_T z;ZuQ|y1{_Djr)^#V14S`N}%FyCmZXHA4x}ixlWflzOR*gl1$uJz)|2SO~gr{(Df{W zdK@CmXtw+YV;rkEV3f-7ol((B$d81Ol%g(W_sO-o0{nWwp@)97==^r{mbIo}?MXgj zEGwwzw*R)Ss>H_Mg7fE_LDd2|7ZznKl0rQ1YPz5jAAnR@S*e5|AU+1boE- zyh*4-VFOOqG_+01j92f**>brE6CXW<67JmrJZi!)CVR$$T=EIBz`m>khqxYlsWxxruyt542ydGCz{H z`^IpDd#x|V=K^`uRFj6%(nfCaPPP^=D(IEkyGrXZ;MI7Kqs-fBu!D7b{fMGdSn>~y zrxRhm)}qchFnt_OK=+h5stCBP51nvF9Dp~Ys3usRLUIr>QoYLINN9Uxr&*t-x%2VR zFP~S;E<|!GgZ3aeN98%+8m5A;XN+T*tekF*g?^G)w(|Dq&!h-jO_=O=`fK^ zB^m9kI`^s&?);b>hu#%0LWVZ%t_m!-V=IixIJ*#VO0WM^J!tdt-f(!#<>`^TnoXCp ztU@s~GEZ8&nM~8hicYw6ZE5o&v%2t+6UI%idc>-ure*0arh~hB9Gk=az}l@rMl=1? zV+T_r;07D$b9b#0Il#2T1FJkJ)Z_nIZ=^rYEUvj##YfdME1Fx&!u#0sR(&UZ@a_qTPe)2-FE$Yw}v9rT};1W zr{6svRSlqydtR~FOTo8Jn*v94Pne1SrLp`yZ#lpT84ELz&>eY23E45?;t^BeRIY>e z+k;qLoYfN<^|?E~!r`)Ag6TfY3M?B25kF~750$*(%c5T^hyYI2! zXUzaeEq9Xkv127H3yfo>iyy&(0co2j)(*>1)M<8Q=&Vz|xSWlE+Q7yF#>aJ6TtNx{ zi8Eiu*_L~*@#iHAAJ#rN^w4my_;q4j8^LN-vwO9kqy&B458*<}p;T`5Dl^S$(3dxA zJhkB_wd)1-Yf=Eg8P9c*oSJ>~NQ8Xym>{M7`$ECDWpiIUnJ(lEaz+|O=^g@3fAKs5 zT~s=HhM%wNx>NIyghweF*tf%Sp6|Dv{7~p`;HN%%1RTl}IYcjdJ?e^&;uD`V2`CerNoz3sEQ{T53*lVtMJYM(<=@0o8uCpJ`fdv5m zP~3k`e%UYp0PKG(u{|7|ZCo6UOw9g;lDb6Gz~OgE?LS>cv=22NT~_QM0zjf^cgKQs z71PNFf{G-NQ=5w|W=8uIk8uMbNvDg@QcgI+!BUYKAA?4h4Du829_Dc%^twx$= zZgOzOWV9Ax`>!y^CwV|^<FKKx zUsHcEnc>WpS#N9HW7IJIh?}abVszNj?L^Pn!hj`^t^$;CG3FS!3oPM#m!Q$Lhn6qDokI5QPQvU zE72+wBWy~(tcYW#XUCLX;82uoA&|Z%W4To&dxFelIKqcmxPZHOdm0!rtgM5 zwu|BdW^3@;Qj^jdaK{yCQ#{JKa_xsm7bo}mj$9STK?%=l75Mc*HiemK+I$O&<<&Jj z4d2bPz|EpK!Pq|Mcm;lSXkEHzz`XJSu}09MG3I1cH%8e7>DT$t9NVjO; zY=pZ3I3l^d$*^+MoZiCBI5a_)uA@0X(ozXK4=zNXJNi9<`J^f^8Kxe&-lSbe!h$|2eHGX z>087>jT%hF6Kr4FKE(DTbg4~21;yGQKxU&YG98uGJR>E9T_w-w1BWBO-!|7>O~)$H(I!KWtUIb zz8obwqSwiZ@}?VDfF#bPT0~KV1g^8kUDCPZ)QqR&esXW{O7pfe6#GSE$cC^iz2xIN zRg-Q4`4_BBRjN{SQ0Y%4vZzx{*ZA;Aox#(-JNL`a5E-g$^b~rIyquJ7~wP;^eza(zpq+#-+ z<&QssT@Her;tK{F zYA(6M3n%eGM^00U#B?{)5L=#-2;r;T^Tm^X7FO{w0J+`IJUgYO~C&@xs1IfNu{y}dDwopHAsb>uQK@cVH=pg-_|HMlW zlfUj`WY~!T&oXL_NonTDb2eI_64!1rxa`|tUk$=f{&T$}d%G?C3!zrjklS5Z4qX0q zz0aNcNrCKB9R5La{P}Xm5*=7$1LXlH5HGnKTcJ;da8w5uNAZraZKMX9132$IO`OB( zPy6qqgVaurkOb*E6)2=X6SEcX%n^HZ()&$8i@_!>_!Cd6Z9M}K?k`SYKK^~N#MlXE z8iE7>{w#_Ae6e)>efQ6N{CD&D?<-^F?{Rj;{~T$Ds`c5gGs1Tr(Li5)f{OkL0^tLM zJ#QKaK)`ZGQPq)O7)hd(c6oauHREw!tDcl9%^N8zlMqKLx&tW|h@C51d)Pww6wI#K zsy~|leg7RoVwHM^pU%bptup+-VR)~iN;eIOE zTYiqC_N3bfh!5n*kxNbKO56JD!Xoa&(0lHekGu=Y7o24Ar!U4GIJ_^Q#Ns4866J1498 z^`PjvgI`26H6ou!vwr{Nsi5_M4H?L-Oih3Wq&|8MnEoae zXrZSLjbFNO_CxQs1bhYuQl->8NAJE(baVB)Zgj5QERjgG2+4=llf!4z2qee5KUg5P z*!v($am?DZqmq6l4&5;#24_V;6TT)4`ZRc+^0IC8g!zXgQVZ_D6oUc)f3jOq1_Trh z?EgsK{;nDMBYFGFiUa=~{XYoc{tEEt@qZ>g`E&h`T?noIma6=#LX`jh(tjT;{gI;l zWyOK`5P$jnmxU>Rz3^W;m%sOu|IDWTvf@B1guk5sf0$JLW7xkl`TzbSrjh>&`@d>k z|6_o^qu-x`;4do Date: Tue, 5 Dec 2023 16:41:58 -0500 Subject: [PATCH 055/111] Add files via upload --- data/notebooks/Clusters_Scout.py | 197 +++++++++++++++++++++++++++ data/notebooks/Metastore_Analysis.py | 106 ++++++++++++++ data/notebooks/Metastore_Scout.py | 88 ++++++++++++ 3 files changed, 391 insertions(+) create mode 100644 data/notebooks/Clusters_Scout.py create mode 100644 data/notebooks/Metastore_Analysis.py create mode 100644 data/notebooks/Metastore_Scout.py diff --git a/data/notebooks/Clusters_Scout.py b/data/notebooks/Clusters_Scout.py new file mode 100644 index 
00000000..b3befd3f --- /dev/null +++ b/data/notebooks/Clusters_Scout.py @@ -0,0 +1,197 @@ +# Databricks notebook source +# Databricks notebook source +import json, os, datetime, requests +import requests.packages.urllib3 + +global pprint_j + +requests.packages.urllib3.disable_warnings() + + +# Helper to pretty print json +def pprint_j(i): + print(json.dumps(i, indent=4, sort_keys=True)) + + +class dbclient: + """ + Rest API Wrapper for Databricks APIs + """ + # set of http error codes to throw an exception if hit. Handles client and auth errors + http_error_codes = (401, 403) + + def __init__(self, token, url): + self._token = {'Authorization': 'Bearer {0}'.format(token)} + self._url = url.rstrip("/") + self._is_verbose = False + self._verify_ssl = False + if self._verify_ssl: + # set these env variables if skip SSL verification is enabled + os.environ['REQUESTS_CA_BUNDLE'] = "" + os.environ['CURL_CA_BUNDLE'] = "" + + def is_aws(self): + return self._is_aws + + def is_verbose(self): + return self._is_verbose + + def is_skip_failed(self): + return self._skip_failed + + def test_connection(self): + # verify the proper url settings to configure this client + if self._url[-4:] != '.com' and self._url[-4:] != '.net': + print("Hostname should end in '.com'") + return -1 + results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, + verify=self._verify_ssl) + http_status_code = results.status_code + if http_status_code != 200: + print("Error. Either the credentials have expired or the credentials don't have proper permissions.") + print("If you have a ~/.netrc file, check those credentials. Those take precedence over passed input.") + print(results.text) + return -1 + return 0 + + def get(self, endpoint, json_params=None, version='2.0', print_json=False): + if version: + ver = version + full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint + if self.is_verbose(): + print("Get: {0}".format(full_endpoint)) + if json_params: + raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) + results = raw_results.json() + else: + raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) + results = raw_results.json() + if print_json: + print(json.dumps(results, indent=4, sort_keys=True)) + if type(results) == list: + results = {'elements': results} + results['http_status_code'] = raw_results.status_code + return results + + def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): + if version: + ver = version + full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint + if self.is_verbose(): + print("{0}: {1}".format(http_type, full_endpoint)) + if json_params: + if http_type == 'post': + if files_json: + raw_results = requests.post(full_endpoint, headers=self._token, + data=json_params, files=files_json, verify=self._verify_ssl) + else: + raw_results = requests.post(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + if http_type == 'put': + raw_results = requests.put(full_endpoint, headers=self._token, + 
json=json_params, verify=self._verify_ssl) + if http_type == 'patch': + raw_results = requests.patch(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, + http_status_code, + raw_results.text)) + results = raw_results.json() + else: + print("Must have a payload in json_args param.") + return {} + if print_json: + print(json.dumps(results, indent=4, sort_keys=True)) + # if results are empty, let's return the return status + if results: + results['http_status_code'] = raw_results.status_code + return results + else: + return {'http_status_code': raw_results.status_code} + + def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): + return self.http_req('post', endpoint, json_params, version, print_json, files_json) + + def put(self, endpoint, json_params, version='2.0', print_json=False): + return self.http_req('put', endpoint, json_params, version, print_json) + + def patch(self, endpoint, json_params, version='2.0', print_json=False): + return self.http_req('patch', endpoint, json_params, version, print_json) + + @staticmethod + def my_map(F, items): + to_return = [] + for elem in items: + to_return.append(F(elem)) + return to_return + + def set_export_dir(self, dir_location): + self._export_dir = dir_location + + def get_export_dir(self): + return self._export_dir + + def get_latest_spark_version(self): + versions = self.get('/clusters/spark-versions')['versions'] + v_sorted = sorted(versions, key=lambda i: i['key'], reverse=True) + for x in v_sorted: + img_type = x['key'].split('-')[1][0:5] + if img_type == 'scala': + return x + + +# COMMAND ---------- + +class discoveryClient(dbclient): + def get_clusters(self): + clusters_list = self.get('/clusters/list').get('clusters', []) + return clusters_list + + def get_num_defined_clusters(self): + clusters_list = self.clusters_list() + return len(clusters_list) + +# COMMAND ---------- + +url = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) +token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) + +client = discoveryClient(token, url) + +# COMMAND ---------- + +clusters = client.get_clusters() +clusters[0] + + +# COMMAND ---------- + +cluster_details = [] +for cluster in clusters: + cluster_id = cluster['cluster_id'] + cluster_name = cluster['cluster_name'] + creator = cluster['creator_user_name'] + node_type = cluster['node_type_id'] + driver_type = cluster['driver_node_type_id'] + custom_tags = cluster['custom_tags'] + spark_version = cluster['spark_version'] + instance_profile = cluster['aws_attributes'].get('instance_profile_arn', 'No Instance Profile') + cluster_details.append((cluster_id, cluster_name, creator, node_type, driver_type, custom_tags, spark_version, instance_profile)) + +# COMMAND ---------- + +columns = ['cluster_id', 'name', 'creator', 'node_type', 'driver_type', 'custom_tags', 'spark_version', 'instance_profile'] +#spark.createDataFrame(data=jobs_details, schema = columns).write.mode("overwrite").saveAsTable("uc_discovery.clusters") + +# COMMAND ---------- + + diff --git a/data/notebooks/Metastore_Analysis.py b/data/notebooks/Metastore_Analysis.py new file mode 100644 index 00000000..8fd6450b --- /dev/null +++ b/data/notebooks/Metastore_Analysis.py @@ -0,0 +1,106 @@ +# Databricks notebook source 
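A note on the Clusters_Scout notebook above: the write-out at the end is commented and still references jobs_details, while the list the notebook actually builds is cluster_details. A minimal sketch of how those rows could be persisted, assuming a uc_discovery database already exists in the workspace:

columns = ['cluster_id', 'name', 'creator', 'node_type', 'driver_type',
           'custom_tags', 'spark_version', 'instance_profile']
# cluster_details is the list of tuples assembled in the cells above;
# custom_tags comes through as a map column.
(spark.createDataFrame(data=cluster_details, schema=columns)
      .write.mode("overwrite")
      .saveAsTable("uc_discovery.clusters"))

The Metastore_Analysis notebook that starts here reads its input the same way, via spark.read.table against the uc_discovery database.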
+dfResults = (spark + .read + .table("uc_discovery.metastore") + ) + +# COMMAND ---------- + +from pyspark.sql.functions import * + +# COMMAND ---------- + +# managed table allocation across metastore + +( + dfResults + .groupBy("tableType") + .count() + .display() +) + +# COMMAND ---------- + +( + dfResults + .withColumn("database", split(col("tableName"), "\.")[0]) + .groupBy(["database", "tableType"]) + .count() + .display() +) + +# COMMAND ---------- + +( + dfResults + .withColumn("dbfs", when(col("tableLocation").contains("dbfs:"), True).otherwise(False)) + .groupBy("dbfs") + .count() + .display() +) + +# COMMAND ---------- + +( + dfResults + .withColumn("database", split(col("tableName"), "\.")[0]) + .withColumn("dbfs", when(col("tableLocation").contains("dbfs:"), True).otherwise(False)) + .groupBy(["database", "dbfs"]) + .count() + .display() +) + +# COMMAND ---------- + +( + dfResults + .groupBy("tableProvider") + .count() + .display() +) + +# COMMAND ---------- + +( + dfResults + .withColumn("database", split(col("tableName"), "\.")[0]) + .groupBy(["database", "tableProvider"]) + .count() + .selectExpr("count AS value", "database", "tableProvider") + .display() +) + +# COMMAND ---------- + +( + dfResults + .groupBy("tableSize") + .count() + .display() +) + +# COMMAND ---------- + +( + dfResults + .withColumn("database", split(col("tableName"), "\.")[0]) + .groupBy("database") + .sum() + .selectExpr("`sum(tableSize)`/1000000000000 AS value", "database") + .display() +) + +# COMMAND ---------- + +( + dfResults + .withColumn("database", split(col("tableName"), "\.")[0]) + .groupBy(["database", "tableVersion"]) + .count() + .selectExpr("count AS value", "database", "tableVersion") + .display() +) + +# COMMAND ---------- + + diff --git a/data/notebooks/Metastore_Scout.py b/data/notebooks/Metastore_Scout.py new file mode 100644 index 00000000..545b3f0c --- /dev/null +++ b/data/notebooks/Metastore_Scout.py @@ -0,0 +1,88 @@ +# Databricks notebook source +!pip install tqdm + +# COMMAND ---------- + +from pyspark.sql.functions import * +from tqdm import tqdm + +# COMMAND ---------- + +dbutils.widgets.text("database_list", "") +database_list = dbutils.widgets.get("database_list").split(",") + +# COMMAND ---------- + +def getAllDatabases(): + databaseList = spark.sql(f"SHOW DATABASES").select("databaseName").rdd.flatMap(lambda x:x).collect() + return databaseList + +def getAllTables(database): + tableList = spark.sql(f"SHOW TABLES IN {database}").select("tableName").rdd.flatMap(lambda x:x).collect() + databaseAndTableList = [f"{database}.{t}" for t in tableList] + return databaseAndTableList + +def getTableDetail(table, detail): + try: + tableDetail = spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == '{detail}'").select("data_type").rdd.flatMap(lambda x:x).collect()[0] + except Exception as e: + tableDetail = "N/A" + return tableDetail + +def getTableSize(table): + spark.sql(f"ANALYZE TABLE {table} COMPUTE STATISTICS NOSCAN") + try: + tableSize = spark.sql(f"DESCRIBE DETAIL {table}").collect()[0]['sizeInBytes'] + except Exception as e: + tableSize = -1 + return tableSize + +def getTableDDL(table): + tableDDL = spark.sql(f"""SHOW CREATE TABLE {table}""").collect()[0][0] + return tableDDL + +# COMMAND ---------- + +def main_scout(database_list): + + if database_list == ['all']: + database_list = getAllDatabases() + + print(f"Analyzing {len(database_list)} databases.") + fullTableList = [] + + for database in database_list: + tableList = getAllTables(database) + print(f"{database}: 
{len(tableList)}") + fullTableList.extend(tableList) + + print(f"Found {len(fullTableList)} in {len(database_list)} databases.") + + fullTableDetails = [] + for table in tqdm(fullTableList): + try: + tableType = getTableDetail(table, "Type") + tableLocation = getTableDetail(table, "Location") + tableProvider = getTableDetail(table, "Provider") + tableVersion = getTableDetail(table, "Created By") + tableSize = getTableSize(table) + tableDDL = getTableDDL(table) + fullTableDetails.append((table, tableType, tableLocation, tableProvider, tableVersion, tableSize, tableDDL)) + except Exception as e: + print(str(e)) + continue + + columns = ["tableName", "tableType", "tableLocation", "tableProvider", "tableVersion", "tableSize", "tableDDL"] + spark.createDataFrame(data=fullTableDetails, schema = columns).write.mode("overwrite").saveAsTable("uc_discovery.metastore") + +# COMMAND ---------- + +main_scout(database_list) + +# COMMAND ---------- + +spark.read.table("uc_discovery.metastore").display() + +# COMMAND ---------- + + From 0b9fe115f7f2dad044905bc2d30b7400dcf58e52 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 7 Dec 2023 16:27:36 -0500 Subject: [PATCH 056/111] add rename emails file --- utils/rename_emails.py | 146 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 utils/rename_emails.py diff --git a/utils/rename_emails.py b/utils/rename_emails.py new file mode 100644 index 00000000..2601d643 --- /dev/null +++ b/utils/rename_emails.py @@ -0,0 +1,146 @@ +import argparse +import os +import shutil + +def to_dict(csv_file): + """ + summary: converts a csv or text file (or another comma delim file) into a + dictionary object + + PARAMETERS: + csv_file: path file of the comma delim file, assumes that there are no column + headings, each user address is split by a new line, and the old and new + address are split by a comma in that order. 
+ + RETURNS: + dict_from_csv: dictionary object where key is the old item and value + is new item + """ + import csv + + dict_from_csv = {} + with open(csv_file, mode='r') as f: + reader = csv.reader(f) + # assuming that each row is "old address, new address" for a user + dict_from_csv = {rows[2]:rows[6] for rows in reader} + del dict_from_csv['userName'] + return dict_from_csv + +def map(file_name, mapping): + """ + summary: reads parameter file_name and replaces all places where previous email + address is used with the new item as indicated in mapping + + PARAMETERS: + file_name: path of the file that is to be read + mapping: dict where key is the previous item and value is the + new item + + RETURNS: + data: a text object + + """ + with open(file_name, "r") as f: + data = f.read() + #print(f"Currently mapping {file_name}") + for e in mapping: + data = data.replace(e, mapping[e]) + return data + +def write(file_name, data_write): + """ + summary: writes parameter data_write to the path indicated by parameter + file_name + + PARAMETERS: + file_name: path of the file that is to be written + data_write: text object + + RETURNS: + n/a + """ + with open(file_name, "w") as f: + f.write(data_write) + +def rename_users_folder(mapping): + """ + summary: renames the user folder by moving all files to new directory + + PARAMETERS: + mapping: dict where key is the previous item and value is the + new item + + RETURNS: + n/a + """ + import shutil + + users = os.listdir('./artifacts/Users') + for u in users: + if '.DS_Store' not in u: + if mapping.get(u, False): + shutil.move("./artifacts/Users/"+u, "./artifacts/NewUsers/"+mapping[u]) + else: + shutil.move("./artifacts/Users/"+u, "./artifacts/NewUsers/"+u) + + os.rename("./artifacts/Users", "./artifacts/EmptyDir") # this is an empty dir + os.rename("./artifacts/NewUsers", "./artifacts/Users") + + +def mapping_file(file_name, mapping): + """ + summary: maps a single file and writes it to a new file and saves the old + log file with the '_prev' suffix + + PARAMETERS: + file_name: path of the file to map + mapping: dict where key is the previous item and value is the + new item + + RETURNS: + n/a + """ + # this code here (directly referencing the number 4) assumes that the file name + # has the 3 letter extension (e.g. 
something.txt or something.csv + data = map(file_name, mapping) + write(file_name, data) + +def main(): + all_args = argparse.ArgumentParser() + all_args.add_argument("--dir", "--file", dest="directory", required=True, help='directory needs to be updated via mapping.') + all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') + + args = all_args.parse_args() + file_name = args.file + mapping_file_ = args.mapping + + mapping = to_dict(mapping_file_) + print("Mapping: ") + print(mapping) + print("--------------------") + yesno = input("Confirm mapping (y/n): ") + if yesno.lower() != "y": + exit() + + # change the current working director to specified path + os.chdir(file_name) + # verify the path using getcwd() + cwd = os.getcwd() + print("Current working directory is:", cwd) + + logs = os.listdir() + + for file in logs: + # making sure we are only getting the logs + if ".log" in file: + mapping_file(file, mapping) + if "groups" == file: + groups = os.listdir("groups") + for g in groups: + mapping_file("/groups/"+g, mapping) + + + rename_users_folder(mapping) + +if __name__ == "__main__": + main() From 32dc28390b6b8d9face216563c359ca597eaa736 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 12 Dec 2023 14:05:02 -0500 Subject: [PATCH 057/111] add files --- data/notebooks/rename_emails.py | 160 ++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 data/notebooks/rename_emails.py diff --git a/data/notebooks/rename_emails.py b/data/notebooks/rename_emails.py new file mode 100644 index 00000000..6a212dfe --- /dev/null +++ b/data/notebooks/rename_emails.py @@ -0,0 +1,160 @@ +import argparse +import os +import shutil +import csv + +def pretty_print_dict(dict_): + """ + summary: prints a dictionary object in a pretty format + + PARAMETERS: + dict_: dictionary object + + RETURNS: + n/a + """ + for key, value in dict_.items(): + print(f"{key}: {value}") + +def to_dict(csv_file, email_column='newEmail'): + """ + summary: converts a csv or text file (or another comma delim file) into a + dictionary object + + PARAMETERS: + csv_file: path file of the comma delim file, assumes that there are no column + headings, each user address is split by a new line, and the old and new + address are split by a comma in that order. 
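To make the behaviour of this rename script concrete: map() performs a plain string substitution over each exported log file, applying the mapping entries in dictionary order. A tiny self-contained sketch with hypothetical addresses:

# hypothetical mapping and log content, for illustration only
mapping = {'jane@old.example.com': 'jane@new.example.com'}
data = '{"userName": "jane@old.example.com", "displayName": "Jane"}'
for old, new in mapping.items():
    data = data.replace(old, new)
print(data)  # the userName value now carries the new address

Because the replacement is purely textual, entries whose old addresses overlap are worth reviewing in the mapping file before answering the confirmation prompt.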
+ + RETURNS: + dict_from_csv: dictionary object where key is the old item and value + is new item + """ + dict_from_csv = {} + with open(csv_file, newline='', mode='r') as f: + reader = csv.DictReader(f) + for row in reader: + dict_from_csv[row['userName']] = row[email_column] + return dict_from_csv + +def map(file_name, mapping): + """ + summary: reads parameter file_name and replaces all places where previous email + address is used with the new item as indicated in mapping + + PARAMETERS: + file_name: path of the file that is to be read + mapping: dict where key is the previous item and value is the + new item + + RETURNS: + data: a text object + + """ + with open(file_name, "r") as f: + data = f.read() + print(f" Currently mapping {file_name}") + for e in mapping: + data = data.replace(e, mapping[e]) + return data + +def write(file_name, data_write): + """ + summary: writes parameter data_write to the path indicated by parameter + file_name + + PARAMETERS: + file_name: path of the file that is to be written + data_write: text object + + RETURNS: + n/a + """ + with open(file_name, "w") as f: + f.write(data_write) + +def rename_users_folder(mapping): + """ + summary: renames the user folder by moving all files to new directory + + PARAMETERS: + mapping: dict where key is the previous item and value is the + new item + + RETURNS: + n/a + """ + import shutil + + users = os.listdir('./artifacts/Users') + for u in users: + if '.DS_Store' not in u: + if mapping.get(u, False): + shutil.move("./artifacts/Users/"+u, "./artifacts/NewUsers/"+mapping[u]) + else: + shutil.move("./artifacts/Users/"+u, "./artifacts/NewUsers/"+u) + + os.rename("./artifacts/Users", "./artifacts/EmptyDir") # this is an empty dir + os.rename("./artifacts/NewUsers", "./artifacts/Users") + + +def mapping_file(file_name, mapping): + """ + summary: maps a single file and writes it to a new file and saves the old + log file with the '_prev' suffix + + PARAMETERS: + file_name: path of the file to map + mapping: dict where key is the previous item and value is the + new item + + RETURNS: + n/a + """ + # this code here (directly referencing the number 4) assumes that the file name + # has the 3 letter extension (e.g. 
something.txt or something.csv + data = map(file_name, mapping) + write(file_name, data) + +def main(): + all_args = argparse.ArgumentParser() + all_args.add_argument("--dir", "--file", dest="file", required=True, help='directory needs to be updated via mapping.') + all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') + all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') + + args = all_args.parse_args() + file_name = args.file + mapping_file_ = args.mapping + email_column = args.column + + mapping = to_dict(mapping_file_, email_column) + print("--------------------") + pretty_print_dict(mapping) + print("--------------------") + yesno = input("Confirm mapping (y/n): ") + if yesno.lower() != "y": + exit() + + # change the current working director to specified path + os.chdir(file_name) + # verify the path using getcwd() + cwd = os.getcwd() + print("--------------------") + print("Current working directory is:", cwd) + + logs = os.listdir() + + for file in logs: + # making sure we are only getting the logs + if ".log" in file: + mapping_file(file, mapping) + if "groups" == file: + groups = os.listdir("groups") + for g in groups: + mapping_file("groups/"+g, mapping) + + + rename_users_folder(mapping) + +if __name__ == "__main__": + main() From c3512d66ff91d0bab39f8db562ce1744d7a81680 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Wed, 13 Dec 2023 11:57:42 -0500 Subject: [PATCH 058/111] add --nitro parameter --- data/nitro_mapping.csv | 151 +++++++++++++++++++++++++++++++++++++ dbclient/ClustersClient.py | 19 ++++- dbclient/JobsClient.py | 23 +++++- dbclient/parser.py | 4 + tasks/tasks.py | 4 +- 5 files changed, 196 insertions(+), 5 deletions(-) create mode 100644 data/nitro_mapping.csv diff --git a/data/nitro_mapping.csv b/data/nitro_mapping.csv new file mode 100644 index 00000000..7015847c --- /dev/null +++ b/data/nitro_mapping.csv @@ -0,0 +1,151 @@ +PVC Instance Type,Recommended Nitro Instance Type, +m4.large,m5n.large,FALSE +m4.xlarge,m5n.xlarge,FALSE +m4.2xlarge,m5n.2xlarge,FALSE +m4.4xlarge,m5n.4xlarge,FALSE +m4.10xlarge,m5n.12xlarge,FALSE +m4.16xlarge,m5n.16xlarge,FALSE +m5.large,m5n.large,FALSE +m5.xlarge,m5n.xlarge,FALSE +m5.2xlarge,m5n.2xlarge,FALSE +m5.4xlarge,m5n.4xlarge,FALSE +m5.8xlarge,m5n.8xlarge,FALSE +m5.12xlarge,m5n.12xlarge,FALSE +m5.16xlarge,m5n.16xlarge,FALSE +m5.24xlarge,m5n.24xlarge,FALSE +m5d.large,m5dn.large,FALSE +m5d.xlarge,m5dn.xlarge,FALSE +m5d.2xlarge,m5dn.2xlarge,FALSE +m5d.4xlarge,m5dn.4xlarge,FALSE +m5d.8xlarge,m5dn.8xlarge,FALSE +m5d.12xlarge,m5dn.12xlarge,FALSE +m5d.16xlarge,m5dn.16xlarge,FALSE +m5d.24xlarge,m5dn.24xlarge,FALSE +m5a.large,m5n.large,FALSE +m5a.xlarge,m5n.xlarge,FALSE +m5a.2xlarge,m5n.2xlarge,FALSE +m5a.4xlarge,m5n.4xlarge,FALSE +m5a.8xlarge,m5n.8xlarge,FALSE +m5a.12xlarge,m5n.12xlarge,FALSE +m5a.16xlarge,m5n.16xlarge,FALSE +m5a.24xlarge,m5n.24xlarge,FALSE +m6g.large,m5n.large,FALSE +m6g.xlarge,m5n.xlarge,FALSE +m6g.2xlarge,m5n.2xlarge,FALSE +m6g.4xlarge,m5n.4xlarge,FALSE +m6g.8xlarge,m5n.8xlarge,FALSE +m6g.12xlarge,m5n.12xlarge,FALSE +m6g.16xlarge,m5n.16xlarge,FALSE +m6gd.large,m5dn.large,FALSE +m6gd.xlarge,m5dn.xlarge,FALSE +m6gd.2xlarge,m5dn.2xlarge,FALSE +m6gd.4xlarge,m5dn.4xlarge,FALSE +m6gd.8xlarge,m5dn.8xlarge,FALSE +m6gd.12xlarge,m5dn.12xlarge,FALSE +m6gd.16xlarge,m5dn.16xlarge,FALSE +c4.2xlarge,c5a.2xlarge,FALSE +c4.4xlarge,c5a.4xlarge,FALSE 
+c4.8xlarge,c5a.8xlarge,FALSE +c5.xlarge,c5a.xlarge,FALSE +c5.2xlarge,c5a.2xlarge,FALSE +c5.4xlarge,c5a.4xlarge,FALSE +c5.9xlarge,c5a.8xlarge,FALSE +c5.12xlarge,c5a.12xlarge,FALSE +c5.18xlarge,c5a.16xlarge,FALSE +c5.24xlarge,c5a.24xlarge,FALSE +c5d.xlarge,c5ad.xlarge,FALSE +c5d.2xlarge,c5ad.2xlarge,FALSE +c5d.4xlarge,c5ad.4xlarge,FALSE +c5d.9xlarge,c5ad.8xlarge,FALSE +c5d.12xlarge,c5ad.12xlarge,FALSE +c5d.18xlarge,c5ad.16xlarge,FALSE +c5d.24xlarge,c5ad.24xlarge,FALSE +c6g.xlarge,c5a.xlarge,FALSE +c6g.2xlarge,c5a.2xlarge,FALSE +c6g.4xlarge,c5a.4xlarge,FALSE +c6g.8xlarge,c5a.8xlarge,FALSE +c6g.12xlarge,c5a.12xlarge,FALSE +c6g.16xlarge,c5a.16xlarge,FALSE +c6gd.xlarge,c5ad.xlarge,FALSE +c6gd.2xlarge,c5ad.2xlarge,FALSE +c6gd.4xlarge,c5ad.4xlarge,FALSE +c6gd.8xlarge,c5ad.8xlarge,FALSE +c6gd.12xlarge,c5ad.12xlarge,FALSE +c6gd.16xlarge,c5ad.16xlarge,FALSE +r3.xlarge,r5n.xlarge,FALSE +r3.2xlarge,r5n.2xlarge,FALSE +r3.4xlarge,r5n.4xlarge,FALSE +r3.8xlarge,r5n.8xlarge,FALSE +r4.xlarge,r5n.xlarge,FALSE +r4.2xlarge,r5n.2xlarge,FALSE +r4.4xlarge,r5n.4xlarge,FALSE +r4.8xlarge,r5n.8xlarge,FALSE +r4.16xlarge,r5n.16xlarge,FALSE +r5.large,r5n.large,FALSE +r5.xlarge,r5n.xlarge,FALSE +r5.2xlarge,r5n.2xlarge,FALSE +r5.4xlarge,r5n.4xlarge,FALSE +r5.8xlarge,r5n.8xlarge,FALSE +r5.12xlarge,r5n.12xlarge,FALSE +r5.16xlarge,r5n.16xlarge,FALSE +r5.24xlarge,r5n.24xlarge,FALSE +r5d.large,r5dn.large,FALSE +r5d.xlarge,r5dn.xlarge,FALSE +r5d.2xlarge,r5dn.2xlarge,FALSE +r5d.4xlarge,r5dn.4xlarge,FALSE +r5d.8xlarge,r5dn.8xlarge,FALSE +r5d.12xlarge,r5dn.12xlarge,FALSE +r5d.16xlarge,r5dn.16xlarge,FALSE +r5d.24xlarge,r5dn.24xlarge,FALSE +r5a.large,r5n.large,FALSE +r5a.xlarge,r5n.xlarge,FALSE +r5a.2xlarge,r5n.2xlarge,FALSE +r5a.4xlarge,r5n.4xlarge,FALSE +r5a.8xlarge,r5n.8xlarge,FALSE +r5a.12xlarge,r5n.12xlarge,FALSE +r5a.16xlarge,r5n.16xlarge,FALSE +r5a.24xlarge,r5n.24xlarge,FALSE +r6g.large,r5n.large,FALSE +r6g.xlarge,r5n.xlarge,FALSE +r6g.2xlarge,r5n.2xlarge,FALSE +r6g.4xlarge,r5n.4xlarge,FALSE +r6g.8xlarge,r5n.8xlarge,FALSE +r6g.12xlarge,r5n.12xlarge,FALSE +r6g.16xlarge,r5n.16xlarge,FALSE +r6gd.large,r5dn.large,FALSE +r6gd.xlarge,r5dn.xlarge,FALSE +r6gd.2xlarge,r5dn.2xlarge,FALSE +r6gd.4xlarge,r5dn.4xlarge,FALSE +r6gd.8xlarge,r5dn.8xlarge,FALSE +r6gd.12xlarge,r5dn.12xlarge,FALSE +r6gd.16xlarge,r5dn.16xlarge,FALSE +i3.large,i4i.large,FALSE +i3.xlarge,i4i.xlarge,FALSE +i3.2xlarge,i4i.2xlarge,FALSE +i3.4xlarge,i4i.4xlarge,FALSE +i3.8xlarge,i4i.8xlarge,FALSE +i3.16xlarge,i4i.16xlarge,FALSE +i2.xlarge,i3en.xlarge,FALSE +i2.2xlarge,i3en.2xlarge,FALSE +i2.4xlarge,i3en.3xlarge,FALSE +i2.8xlarge,i3en.6xlarge,FALSE +p2.xlargeGPU,g4ad.4xlargeGPU,FALSE +p2.8xlargeGPU,g4ad.8xlargeGPU,FALSE +p2.16xlargeGPU,g4ad.16xlargeGPU,FALSE +p3.2xlargeGPU,g4ad.4xlargeGPU,FALSE +p3.8xlargeGPU,g4ad.8xlargeGPU,FALSE +p3.16xlargeGPU,g4ad.16xlargeGPU,FALSE +g5.xlargeGPU,g4dn.xlargeGPU,FALSE +g5.2xlargeGPU,g4dn.2xlargeGPU,FALSE +g5.4xlargeGPU,g4dn.4xlargeGPU,FALSE +g5.8xlargeGPU,g4dn.8xlargeGPU,FALSE +g5.16xlargeGPU,g4dn.12xlargeGPU,FALSE +g5.12xlargeGPU,g4dn.16xlargeGPU,FALSE +g5.24xlargeGPU,p3dn.24xlargeGPU,FALSE +g5.48xlargeGPU,p3dn.24xlargeGPU,FALSE +z1d.large,r5n.large,FALSE +z1d.xlarge,r5n.xlarge,FALSE +z1d.2xlarge,r5n.2xlarge,FALSE +z1d.3xlarge,r5n.4xlarge,FALSE +z1d.6xlarge,r5n.8xlarge,FALSE +z1d.12xlarge,r5n.12xlarge,FALSE \ No newline at end of file diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 81cbd907..5678ddda 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -1,5 +1,6 @@ import logging 
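The new data/nitro_mapping.csv above is a flat lookup from PVC instance types to recommended Nitro equivalents; the ClustersClient and JobsClient changes below wrap a dictionary built from exactly this file. A quick sketch of the lookup, assuming it is run from the repository root:

import csv

# build {PVC instance type: recommended Nitro instance type}; the third,
# unnamed CSV column is unused by the import path.
with open('data/nitro_mapping.csv', newline='') as f:
    nitro_map = {row['PVC Instance Type']: row['Recommended Nitro Instance Type']
                 for row in csv.DictReader(f)}

print(nitro_map['m4.large'])    # m5n.large
print(nitro_map['i3.2xlarge'])  # i4i.2xlarge

An instance type missing from the CSV raises a KeyError, and the --nitro import path behaves the same way, so the mapping file needs a row for every node type present in the exported cluster and job configs.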
import os +import csv import re import time import logging_utils @@ -256,8 +257,21 @@ def get_new_policy_id_dict(self, policy_file='cluster_policies.log'): old_policy_id = policy_conf['policy_id'] policy_id_dict[old_policy_id] = current_policies_dict[policy_name] # old_id : new_id return policy_id_dict + + def nitro_instance_mapping(self, node_instance_type_id, driver_node_instance_type_id): + dict_from_csv = {} + real_path = os.path.dirname(os.path.realpath(__file__)) + csv_file = f'{real_path}/../data/nitro_mapping.csv' + with open(csv_file, newline='', mode='r') as f: + reader = csv.DictReader(f) + for row in reader: + dict_from_csv[row['PVC Instance Type']] = row['Recommended Nitro Instance Type'] + + nitro_node_instance_id = dict_from_csv[node_instance_type_id] + nitro_driver_node_instance_id = dict_from_csv[driver_node_instance_type_id] + return nitro_node_instance_id, nitro_driver_node_instance_id - def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None): + def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None, nitro=False): """ Import cluster configs and update appropriate properties / tags in the new env :param log_file: @@ -301,6 +315,9 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus else: cluster_conf['custom_tags'] = {'OriginalCreator': cluster_creator} new_cluster_conf = cluster_conf + if nitro: + new_cluster_conf['node_type_id'], new_cluster_conf['driver_node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['node_type_id'], new_cluster_conf['driver_node_type_id']) + print("Creating cluster: {0}".format(new_cluster_conf['cluster_name'])) cluster_resp = self.post('/clusters/create', new_cluster_conf) if cluster_resp['http_status_code'] == 200: diff --git a/dbclient/JobsClient.py b/dbclient/JobsClient.py index 262cd028..41effcb2 100644 --- a/dbclient/JobsClient.py +++ b/dbclient/JobsClient.py @@ -1,4 +1,5 @@ import json +import csv import os import logging import logging_utils @@ -159,7 +160,19 @@ def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.lo 'error': message, 'json': json.dumps(x) }) - def import_job_configs(self, log_file='jobs.log', acl_file='acl_jobs.log', job_map_file='job_id_map.log'): + def nitro_instance_mapping(self, instance_type_id): + dict_from_csv = {} + real_path = os.path.dirname(os.path.realpath(__file__)) + csv_file = f'{real_path}/../data/nitro_mapping.csv' + with open(csv_file, newline='', mode='r') as f: + reader = csv.DictReader(f) + for row in reader: + dict_from_csv[row['PVC Instance Type']] = row['Recommended Nitro Instance Type'] + + nitro_instance_id = dict_from_csv[instance_type_id] + return nitro_instance_id + + def import_job_configs(self, log_file='jobs.log', acl_file='acl_jobs.log', job_map_file='job_id_map.log', nitro=False): jobs_log = self.get_export_dir() + log_file acl_jobs_log = self.get_export_dir() + acl_file job_map_log = self.get_export_dir() + job_map_file @@ -242,7 +255,13 @@ def adjust_ids_for_cluster(settings): #job_settings or task_settings mod_task_settings.append(adjust_ids_for_cluster(task_settings)) if len(mod_task_settings) > 0: job_settings['tasks'] = mod_task_settings - + if nitro: + if 'new_cluster' in job_settings: + if 'node_type_id' in job_settings['new_cluster']: + job_settings['new_cluster']['node_type_id'] = self.nitro_instance_mapping(job_settings['new_cluster']['node_type_id']) + if 'driver_node_type_id' in job_settings['new_cluster']: 
+ job_settings['new_cluster']['driver_node_type_id'] = self.nitro_instance_mapping(job_settings['new_cluster']['driver_node_type_id']) + logging.info(job_settings) logging.info("Current Job Name: {0}".format(job_conf['settings']['name'])) # creator can be none if the user is no longer in the org. see our docs page create_resp = self.post('/jobs/create', job_settings) diff --git a/dbclient/parser.py b/dbclient/parser.py index e795f5a9..8088da90 100644 --- a/dbclient/parser.py +++ b/dbclient/parser.py @@ -503,6 +503,10 @@ def get_pipeline_parser() -> argparse.ArgumentParser: parser.add_argument('--archive-missing', action='store_true', help='Import all missing users into the top level /Archive/ directory.') + # Cluster + Job arguments + parser.add_argument('--nitro', action='store_true', + help='Set to use Nitro cluster types for all clusters and jobs.') + # Jobs arguments parser.add_argument('--default-job-owner', action='store', default=False, help='Set a default job owner for jobs without an owner.') diff --git a/tasks/tasks.py b/tasks/tasks.py index 9d142c54..5ea2b0d0 100644 --- a/tasks/tasks.py +++ b/tasks/tasks.py @@ -226,7 +226,7 @@ def __init__(self, client_config, args, checkpoint_service, skip=False): def run(self): cl_c = ClustersClient(self.client_config, self.checkpoint_service) cl_c.import_cluster_policies() - cl_c.import_cluster_configs() + cl_c.import_cluster_configs(nitro=self.args.nitro) class InstancePoolsImportTask(AbstractTask): @@ -277,7 +277,7 @@ def __init__(self, client_config, args, checkpoint_service, skip=False): def run(self): jobs_c = JobsClient(self.client_config, self.checkpoint_service) - jobs_c.import_job_configs() + jobs_c.import_job_configs(nitro=self.args.nitro) class MetastoreExportTask(AbstractTask): From a6ec90f93e0f5072d83d7ce7cb0ae614848c0a5b Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 13 Dec 2023 15:06:03 -0500 Subject: [PATCH 059/111] deleting extraneous files --- data/notebooks/Archive.zip | Bin 10668 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 data/notebooks/Archive.zip diff --git a/data/notebooks/Archive.zip b/data/notebooks/Archive.zip deleted file mode 100644 index 223442167e114f853482a517106bdfeac3221517..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10668 zcmb_?1#BMclI3q^J7#A3&CJZq%*@OXGcz;A?3me(nPO&Uh?$w~#FNa-oxStst@LKK z)oQ6hUDd5Fo$plDQIr7#M*~3r-eTQCH30utNwR z@93qbiU5Gh_H{5T`n?zY-rYRm03hHGAOOH`M*u)q#(3Nw^{22WFW7zk?}>BvlPx8* zapq1f5?N4n5wWNgE1ChH+u5=_1P1Z9lU!-CY(XK={Cy$V4^OO4!Y$Wb3M@T@b#o&H z8&H}wQ0N}@f7;>n>97LFiti%O7-(L?>R1m$5mSjY6H#QOR9APqp^!_?`?SYdh^3L- zd{NF0UHbX4!;-@W%ZG(g433-|%cX*~Wji{#lTYVd*2?)RBEj7?mA3Gip*hyO>_5Hd z%`gN=%|H>(Fl>OtvIyZwh+rr7LgK?1cLwcPbL0onCP+Ko1X>0Ii;MP#!ih>b zu#z+7NG}A<`o5|%vU@P!*J9?gIy7A2luX@SnI3;B1UBd$!x5}S6KtCfj(;U|b>@bW z3Sl)Y;FclZE8}IcgFtKKi|TdP_vOA1=ipPZE_@ zJX}DfkcwGtbLgaRjfvrkcNy-LV~E_Av`VkOS~yQkE==Yb^8g6*lsjr5CrrtC+a>&j z#oq3^-(04GAcUMv*5XYZMq4f1(|Lxy-xHo(il*tSi@~0yE+w;YIgmd?FCId=&Nlfl>F1t;_>HQP(OTs8osXy?XP(@ZYBu(M_3Y ze$HF{Fbf)88N|0;?-{eku*g6ZcB@$ysahdQ^_P7a8;iP`JhL&|-ML1X#F(IcT{tV- zl^D-vw9YsjYuzW!)KHh*h1G;CSmr1=gZ!K@8`V!kE2iUp#=LvclpWU473O_EbKLC5-fZKBr451b+Avp5@qkwldNIebtA-&!7)fN@$_VWw?*=1MIzDplxSwJ$jC z9$p+96g5n~8T0HUpW!dI2Bqs4EJpocKLjm$f{`&2qkCXo{-q|CHH|~jO!qJDtaiun z0X0O@bR{}zr+|E0$S;P0L zh1Pi=Q`n{zQ48!eH#{GBshE8ZqG)TU+RF6dn4Uzqrb6g!Sg>tqSiqz-%hZz0VDdcv za&NO`(1i98F+U^x>(@M6;*~f)obGbFq4)mkoeMa$lgQ>ageG*j?Sg(XBocg;B0uU9 zp57q%U32eA7bk4Et?Ot$zcqHA;|B-@+l;imVp;2?Anx5yRD1#7x8k 
z9K?bB1dVr~5Rd=IZkrcfSRNI(`C-%;Yl+PomN?wLzoHBjv^<~jlqWa<@CUvW{~g%= z;Mw1S?aw*nZ(wtCwX$_#_%AH0{*7a0{}app!^b~@qZ`@p2><27)xp5T#r^NVnDg7w zAAtM^U|iJFaoB4|_Is)|T!hk!jc-_$0MXe2ShOC?z_J2gcW^-IX5&z_&_vf(7q0yU z2BN2hnCGt_-7|SBYpbd&Q-FnG1AmKY6BSs1&wlld^WRFT8(^ zxX-Ccv{<5RoJ@auSWt6zHZaG9IM-?Dv<@kj>$`2)lUBsCh&fHjRO$VJs#U}40ahxV zFze*f?TNdD4Uka{DrGApKUUzqOo>ftNpL&p)E!be_dHPTL~%qwL^lot^~zDtl52P7 z4#qfo!2TLgH0v(-(pkZpLl|V_;Dgea7kQI?dv(5B@s4pkZ6xq5SV6@ z5wq$ps1UkUA2vNPKB^elN{G*TLZXHBn3m~BctRv2 zn4(=1xiD;NMOU&(Aj!5XjAY=C^aG^|$06(PKHF0d)14)^zXpR5#(h6lA0%?Ly}?2% z>H&|gu5Vms`NVjR>~-I5m!61G$`FySt5{_9rL~_t|io|1apucKN;6! zmAUu0zrWDE&nLnpJGyDI*pk5YE2fAa!okpRg{6*?%W6f#>RjIILCLGxfdKc{!l%+( zXJxf-9(+H19SuVf)S52=QrQVcE?Gvk#`uMrt?l=Te9>OJn9hX4GsdDC1Ulq_g)|UZ z0`{&z^jGmRg0!2!rO%tReBrj%&-BO&mE#2073qX;W}MRTsa!|dW=~t0!>;4|of7o! zd5I!#eTZN=u& zg3T5Y$>`BqMOlLrb=K&=;-iCYkgUj8r^e~`Wsnw4P#h_pj{s_nbQmEwb)8(K$Dv1c z@0NpMEDJ)2;pEP!L1B0}p8Ar;)9Hfd5vnnGFsATHlpi%p3!jODktgTE%CY9(skCW+ zY3pkYHkOg*9Blipxx^K{j3N)*z+71w_bq7huQBRl-iovE#RP)U&b~|?_l3j5 zH;o_mCPvNbatC=(OOJrOk~veGX;eQcvz;&32Ca4M7{6x~aQdzLihpT#{{$xMhP8mT zRMygEM6+7(NlDohXqWHTd2FE{2fPFht#VQdPK8ptI9*>Vm`|f}Y|N|r9%@S+^uT5S znPi+xp5qoDN>fDRh;Hhrw}?B7CCsTOj`{;SU2=vtv>9KKeCW>M$RF6s#OP?Bui@Kg z3QC~+UdkkI+YqGKjXUAIJNX<{2(V13|5%WeI*V`_{@vxy1Ra{bFzLX;@||^(&vb z0n{w;1Dnz6=wKPMDrwMVE!x`GhAQjl{3z{3Ik*~$T04y**D_w2yXp+!mrl7Iv|20g z`PF~|f~@%DwX1=PH`02bdp+9+AJ0!0NB%=c%@y%yoFZ-Xdk$mP2GL_sJy7fHV7^O! z!M}L!VP!7Dce-`=Y=OEm&CMoQzQkI2kGSvSOiJ~;XXMp!wI!W= z%MSdOY_J{-`99C={G=HZV^i9=XA4=W;{eBy3(#1Bo zh&`=;8;rwN9{w@J7E2SH0>|!|1)-SIb($P`ry&tB-ACCYlpW{DhK#`yB=&&=sPicG z6Vpl`2yGYZTnb|(ywbI=hHiqWLyke>pa#mY3z&jJ)@?2(3Rvi8w{=n}<9$PT!NOs& z=_YPm1xFv{_>}sB-s4A=V7xA!#y4C3&oV5HL87dG`SdRJLwL< zS(f~)?85ujj(2&cLbZU!x_y5L+S{KA<#Ud;6}c4zLTWnOh&vM{{0+vazKT1JHy>Gn ziY-OYZ9SHnsrxo?yxTh-k?AWTm2eL(f$u-U-XjM zR9(uoCS9lzEs@BMLmYCdH|ohV4}-S#qlss#6#VDTQzEbKWoEbp2~rKY^?Lr{Fd=kH_)fUHeH@ zKyw(8N+bxSTf3uR{mQl;NOev^-1|!DehzaUV9tr}ri4c4?n4oTJ`$)g2H!!%U8&>x zFbsMeNf>_}ej}8X5AzGhT*tGgT?TO&kf@9KTGEqQU^tvoeWeMoNx3q3!#M{T3_ zEe@lCGW}I0G&8odzIa0SLFghl6DD+ANJ0^p8*d8~0b$pEnW6SzfcL}N93vX-8f@Ss z^x%=+AsR@VswMw{kkXTO&!yECfrnPW_5GAqS7pf9JVdWc*Y-9!hvFg~x&lVaLNac2 z6)Zg2v90io2&Y1m)lTd>I_@rA;(6JmiiY$v__`c=PE@$=!INrw3@6+WDRStcELHf@ zvruKgu%evoLi7<0DB%y6^C(RWrM}RwOmA|hxX2Eb`9VW3gQ~gNl;g=8yP(=z(Me-T z+KiAGEVvy+9zUYyBRg^yp{I{K*%F%1VWlYL9hehNG2$7=i!4>q(82E(&($KLZ(BVM zl0fOC9K7tVJ;I+^N@`T5!jE~LB)s9EcyJzTC)|`0tY>EovlYzwr^jejwr488)MqE5 z5OCy@Kq}efS%S#rOVy|HmU?@IbS-qO7}%WL(2`U6;rLOWhG4K%!1NvO&qGhYP*e*S zt$q*tJXIquee@nGZJR>r3cVm0f1Mgdis9UEmipN}fYivpy89w;N1jk>di$IB75U4w zh>HWd9Kn&~3Qn&dblKOz?TEV~ZlWt1Lfyf93KZ4lYnVKTi_O@VyysXlq3ZvwfG?#n|G32+H5I$4OY7vhn02TQ#tM2q_o?` zDr|-Clh8DK%=GE2QN?_{?G>`8og_Wuv$EO^PA{f&d4NTc_gJE#ZTg$mK9SrOZ-&(J zp~ZANPn?tQKVN6m-8LLL{iVX7F`UL=LGHMCq5E(`JSC}<+CM=4piDR9Y}geT0PqJ( z{yEA>>itJM=Rc|*7e`ww*Zx7rF`#do3tEM=IA=NJ&P^^0kn2DjtE4380(& zox!-!4Wc|7BS}Qc*>b@?PaGmqNkFCBFPxjUB*cQYPUf6=zsxyPhDT~|Jlhd#R`r+(p+MyH8ep7!Uia5@uOFSqj|3HhSXWYRIljWgG{SR~=eJ-&-7@tO z+KD<2d*+wPxb}^o_hcdcL5FbFkV0-OemVahXCWkMA&HGve?9*ea;WhoC}Un3Qh$Vd z8q`UqCW6J_`$-x@juoBtDJ@>|HxvZCMT=yzf~Q~kvfG*JRhtA0&sO`GIfr;5RrcP{ zEneEDN&N_g_%h#kNH6^p$L;DZX?&!X$547rH4h7!ZRCSTph{#K?KNe8lC7{+A95n3 zMsL!O5T+#|QKZ|hYGi%-WJr!FsO{T#mKRts7ek@XhB@D^06sosn&1sd5`b)bxKhrM z=@N->boD(GtBDI;EKlaBjoNR)^%&;3Ze~B21@5CMJaLw`DP~HKf&DN0DLP8GZ zNf(C~i6}Gus7c8+s+6)2JJ3}gXDlVxFXclsOOtzjeca%8GVc@c0g1(&p(bGpl-*dt zWLo6JVlHo9^SSfY;o9sG0^V|9qM&!k!FTqMIako6fZ;%8a#%*>dc~cmfNxkpT{J1I zgbn3>A&Qt&kZ$n8A(*COuJs18h(sL7*kCBHAPRMe;-HdsY4JjHGs8D-5TK`hFKgk zpTALxj5*{P;#$LK62eizZy~OMhjFf>M_Hu3<9Y4vlJWrbs;N|Gf!Zqu(e1j1vZ8;1 
z!p361wJuMN%fedAD>9pOP*NV6JUWTM4E3s)c@eCG11RAo(}k8$D+uYDmOuka5yUZ!Un^qmNBp8VmHln zFa{Ivg^9G9;_z#(Dr|Y4Z9M7gu?g`xwM@;fM;zB~ZG8m4Or-n?i6;#rr}Ro(%@tF~ zz%O57AjpMf3drV}bMdNK-Qma0ScyN{NYdR^#52^U+D5bvSAQZ$bybR)Os-6VXc_i)B+_9g)OhW3v0W+Np(x|FSg9EQlJfNv1jrkX1+;_0levYR8_D zL9L*w0^%wvt|xI@RfexmtgUIPJ2O}V2OfpMrJzR@vxfLG*sa!f2Sig z=}FT*4-phw*a!M}EEDIogm{!eQN2C!P`veLjv?U_^CBVqD)KxC&gE+2%O}W7V`E|E zAr&vdDcfm+;O$)O({r&{it^NJXFAoQM}H_z#aU;L@dyJ!f^+{w2C?b1u?K8a&pv7| z_XqKJsaHmXCBhwoz~noj;J045AQhxb2eq^+#1NhAxN_o33$7rBY?GSk59bq9QH_ZU zXjG!=r4qq`G2C1F9ES7V#os{GW-f-PQMbfNacxX)*}xKrRyT%80%y_idUBM_A7K5c zj>r2cD+2<`NRsPmfEAg`)*@jrao3x04J%l;4FlxSZCm;ou`wV_K^5c~@PQrtkU&_T z;ZuQ|y1{_Djr)^#V14S`N}%FyCmZXHA4x}ixlWflzOR*gl1$uJz)|2SO~gr{(Df{W zdK@CmXtw+YV;rkEV3f-7ol((B$d81Ol%g(W_sO-o0{nWwp@)97==^r{mbIo}?MXgj zEGwwzw*R)Ss>H_Mg7fE_LDd2|7ZznKl0rQ1YPz5jAAnR@S*e5|AU+1boE- zyh*4-VFOOqG_+01j92f**>brE6CXW<67JmrJZi!)CVR$$T=EIBz`m>khqxYlsWxxruyt542ydGCz{H z`^IpDd#x|V=K^`uRFj6%(nfCaPPP^=D(IEkyGrXZ;MI7Kqs-fBu!D7b{fMGdSn>~y zrxRhm)}qchFnt_OK=+h5stCBP51nvF9Dp~Ys3usRLUIr>QoYLINN9Uxr&*t-x%2VR zFP~S;E<|!GgZ3aeN98%+8m5A;XN+T*tekF*g?^G)w(|Dq&!h-jO_=O=`fK^ zB^m9kI`^s&?);b>hu#%0LWVZ%t_m!-V=IixIJ*#VO0WM^J!tdt-f(!#<>`^TnoXCp ztU@s~GEZ8&nM~8hicYw6ZE5o&v%2t+6UI%idc>-ure*0arh~hB9Gk=az}l@rMl=1? zV+T_r;07D$b9b#0Il#2T1FJkJ)Z_nIZ=^rYEUvj##YfdME1Fx&!u#0sR(&UZ@a_qTPe)2-FE$Yw}v9rT};1W zr{6svRSlqydtR~FOTo8Jn*v94Pne1SrLp`yZ#lpT84ELz&>eY23E45?;t^BeRIY>e z+k;qLoYfN<^|?E~!r`)Ag6TfY3M?B25kF~750$*(%c5T^hyYI2! zXUzaeEq9Xkv127H3yfo>iyy&(0co2j)(*>1)M<8Q=&Vz|xSWlE+Q7yF#>aJ6TtNx{ zi8Eiu*_L~*@#iHAAJ#rN^w4my_;q4j8^LN-vwO9kqy&B458*<}p;T`5Dl^S$(3dxA zJhkB_wd)1-Yf=Eg8P9c*oSJ>~NQ8Xym>{M7`$ECDWpiIUnJ(lEaz+|O=^g@3fAKs5 zT~s=HhM%wNx>NIyghweF*tf%Sp6|Dv{7~p`;HN%%1RTl}IYcjdJ?e^&;uD`V2`CerNoz3sEQ{T53*lVtMJYM(<=@0o8uCpJ`fdv5m zP~3k`e%UYp0PKG(u{|7|ZCo6UOw9g;lDb6Gz~OgE?LS>cv=22NT~_QM0zjf^cgKQs z71PNFf{G-NQ=5w|W=8uIk8uMbNvDg@QcgI+!BUYKAA?4h4Du829_Dc%^twx$= zZgOzOWV9Ax`>!y^CwV|^<FKKx zUsHcEnc>WpS#N9HW7IJIh?}abVszNj?L^Pn!hj`^t^$;CG3FS!3oPM#m!Q$Lhn6qDokI5QPQvU zE72+wBWy~(tcYW#XUCLX;82uoA&|Z%W4To&dxFelIKqcmxPZHOdm0!rtgM5 zwu|BdW^3@;Qj^jdaK{yCQ#{JKa_xsm7bo}mj$9STK?%=l75Mc*HiemK+I$O&<<&Jj z4d2bPz|EpK!Pq|Mcm;lSXkEHzz`XJSu}09MG3I1cH%8e7>DT$t9NVjO; zY=pZ3I3l^d$*^+MoZiCBI5a_)uA@0X(ozXK4=zNXJNi9<`J^f^8Kxe&-lSbe!h$|2eHGX z>087>jT%hF6Kr4FKE(DTbg4~21;yGQKxU&YG98uGJR>E9T_w-w1BWBO-!|7>O~)$H(I!KWtUIb zz8obwqSwiZ@}?VDfF#bPT0~KV1g^8kUDCPZ)QqR&esXW{O7pfe6#GSE$cC^iz2xIN zRg-Q4`4_BBRjN{SQ0Y%4vZzx{*ZA;Aox#(-JNL`a5E-g$^b~rIyquJ7~wP;^eza(zpq+#-+ z<&QssT@Her;tK{F zYA(6M3n%eGM^00U#B?{)5L=#-2;r;T^Tm^X7FO{w0J+`IJUgYO~C&@xs1IfNu{y}dDwopHAsb>uQK@cVH=pg-_|HMlW zlfUj`WY~!T&oXL_NonTDb2eI_64!1rxa`|tUk$=f{&T$}d%G?C3!zrjklS5Z4qX0q zz0aNcNrCKB9R5La{P}Xm5*=7$1LXlH5HGnKTcJ;da8w5uNAZraZKMX9132$IO`OB( zPy6qqgVaurkOb*E6)2=X6SEcX%n^HZ()&$8i@_!>_!Cd6Z9M}K?k`SYKK^~N#MlXE z8iE7>{w#_Ae6e)>efQ6N{CD&D?<-^F?{Rj;{~T$Ds`c5gGs1Tr(Li5)f{OkL0^tLM zJ#QKaK)`ZGQPq)O7)hd(c6oauHREw!tDcl9%^N8zlMqKLx&tW|h@C51d)Pww6wI#K zsy~|leg7RoVwHM^pU%bptup+-VR)~iN;eIOE zTYiqC_N3bfh!5n*kxNbKO56JD!Xoa&(0lHekGu=Y7o24Ar!U4GIJ_^Q#Ns4866J1498 z^`PjvgI`26H6ou!vwr{Nsi5_M4H?L-Oih3Wq&|8MnEoae zXrZSLjbFNO_CxQs1bhYuQl->8NAJE(baVB)Zgj5QERjgG2+4=llf!4z2qee5KUg5P z*!v($am?DZqmq6l4&5;#24_V;6TT)4`ZRc+^0IC8g!zXgQVZ_D6oUc)f3jOq1_Trh z?EgsK{;nDMBYFGFiUa=~{XYoc{tEEt@qZ>g`E&h`T?noIma6=#LX`jh(tjT;{gI;l zWyOK`5P$jnmxU>Rz3^W;m%sOu|IDWTvf@B1guk5sf0$JLW7xkl`TzbSrjh>&`@d>k z|6_o^qu-x`;4do Date: Wed, 13 Dec 2023 15:06:17 -0500 Subject: [PATCH 060/111] delete extra files --- data/notebooks/Clusters_Scout.py | 197 ------------------------------- 1 file changed, 197 deletions(-) delete 
mode 100644 data/notebooks/Clusters_Scout.py diff --git a/data/notebooks/Clusters_Scout.py b/data/notebooks/Clusters_Scout.py deleted file mode 100644 index b3befd3f..00000000 --- a/data/notebooks/Clusters_Scout.py +++ /dev/null @@ -1,197 +0,0 @@ -# Databricks notebook source -# Databricks notebook source -import json, os, datetime, requests -import requests.packages.urllib3 - -global pprint_j - -requests.packages.urllib3.disable_warnings() - - -# Helper to pretty print json -def pprint_j(i): - print(json.dumps(i, indent=4, sort_keys=True)) - - -class dbclient: - """ - Rest API Wrapper for Databricks APIs - """ - # set of http error codes to throw an exception if hit. Handles client and auth errors - http_error_codes = (401, 403) - - def __init__(self, token, url): - self._token = {'Authorization': 'Bearer {0}'.format(token)} - self._url = url.rstrip("/") - self._is_verbose = False - self._verify_ssl = False - if self._verify_ssl: - # set these env variables if skip SSL verification is enabled - os.environ['REQUESTS_CA_BUNDLE'] = "" - os.environ['CURL_CA_BUNDLE'] = "" - - def is_aws(self): - return self._is_aws - - def is_verbose(self): - return self._is_verbose - - def is_skip_failed(self): - return self._skip_failed - - def test_connection(self): - # verify the proper url settings to configure this client - if self._url[-4:] != '.com' and self._url[-4:] != '.net': - print("Hostname should end in '.com'") - return -1 - results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, - verify=self._verify_ssl) - http_status_code = results.status_code - if http_status_code != 200: - print("Error. Either the credentials have expired or the credentials don't have proper permissions.") - print("If you have a ~/.netrc file, check those credentials. 
Those take precedence over passed input.") - print(results.text) - return -1 - return 0 - - def get(self, endpoint, json_params=None, version='2.0', print_json=False): - if version: - ver = version - full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint - if self.is_verbose(): - print("Get: {0}".format(full_endpoint)) - if json_params: - raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) - results = raw_results.json() - else: - raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) - results = raw_results.json() - if print_json: - print(json.dumps(results, indent=4, sort_keys=True)) - if type(results) == list: - results = {'elements': results} - results['http_status_code'] = raw_results.status_code - return results - - def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): - if version: - ver = version - full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint - if self.is_verbose(): - print("{0}: {1}".format(http_type, full_endpoint)) - if json_params: - if http_type == 'post': - if files_json: - raw_results = requests.post(full_endpoint, headers=self._token, - data=json_params, files=files_json, verify=self._verify_ssl) - else: - raw_results = requests.post(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - if http_type == 'put': - raw_results = requests.put(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - if http_type == 'patch': - raw_results = requests.patch(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, - http_status_code, - raw_results.text)) - results = raw_results.json() - else: - print("Must have a payload in json_args param.") - return {} - if print_json: - print(json.dumps(results, indent=4, sort_keys=True)) - # if results are empty, let's return the return status - if results: - results['http_status_code'] = raw_results.status_code - return results - else: - return {'http_status_code': raw_results.status_code} - - def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): - return self.http_req('post', endpoint, json_params, version, print_json, files_json) - - def put(self, endpoint, json_params, version='2.0', print_json=False): - return self.http_req('put', endpoint, json_params, version, print_json) - - def patch(self, endpoint, json_params, version='2.0', print_json=False): - return self.http_req('patch', endpoint, json_params, version, print_json) - - @staticmethod - def my_map(F, items): - to_return = [] - for elem in items: - to_return.append(F(elem)) - return to_return - - def set_export_dir(self, dir_location): - self._export_dir = dir_location - - def get_export_dir(self): - return self._export_dir - - def get_latest_spark_version(self): - versions = self.get('/clusters/spark-versions')['versions'] - v_sorted = 
sorted(versions, key=lambda i: i['key'], reverse=True) - for x in v_sorted: - img_type = x['key'].split('-')[1][0:5] - if img_type == 'scala': - return x - - -# COMMAND ---------- - -class discoveryClient(dbclient): - def get_clusters(self): - clusters_list = self.get('/clusters/list').get('clusters', []) - return clusters_list - - def get_num_defined_clusters(self): - clusters_list = self.clusters_list() - return len(clusters_list) - -# COMMAND ---------- - -url = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) -token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) - -client = discoveryClient(token, url) - -# COMMAND ---------- - -clusters = client.get_clusters() -clusters[0] - - -# COMMAND ---------- - -cluster_details = [] -for cluster in clusters: - cluster_id = cluster['cluster_id'] - cluster_name = cluster['cluster_name'] - creator = cluster['creator_user_name'] - node_type = cluster['node_type_id'] - driver_type = cluster['driver_node_type_id'] - custom_tags = cluster['custom_tags'] - spark_version = cluster['spark_version'] - instance_profile = cluster['aws_attributes'].get('instance_profile_arn', 'No Instance Profile') - cluster_details.append((cluster_id, cluster_name, creator, node_type, driver_type, custom_tags, spark_version, instance_profile)) - -# COMMAND ---------- - -columns = ['cluster_id', 'name', 'creator', 'node_type', 'driver_type', 'custom_tags', 'spark_version', 'instance_profile'] -#spark.createDataFrame(data=jobs_details, schema = columns).write.mode("overwrite").saveAsTable("uc_discovery.clusters") - -# COMMAND ---------- - - From 009bb5c48d04750cef45a7a40b56a0594a64fd02 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 13 Dec 2023 15:08:15 -0500 Subject: [PATCH 061/111] delete extra files --- data/notebooks/Metastore_Analysis.py | 106 --------------------------- 1 file changed, 106 deletions(-) delete mode 100644 data/notebooks/Metastore_Analysis.py diff --git a/data/notebooks/Metastore_Analysis.py b/data/notebooks/Metastore_Analysis.py deleted file mode 100644 index 8fd6450b..00000000 --- a/data/notebooks/Metastore_Analysis.py +++ /dev/null @@ -1,106 +0,0 @@ -# Databricks notebook source -dfResults = (spark - .read - .table("uc_discovery.metastore") - ) - -# COMMAND ---------- - -from pyspark.sql.functions import * - -# COMMAND ---------- - -# managed table allocation across metastore - -( - dfResults - .groupBy("tableType") - .count() - .display() -) - -# COMMAND ---------- - -( - dfResults - .withColumn("database", split(col("tableName"), "\.")[0]) - .groupBy(["database", "tableType"]) - .count() - .display() -) - -# COMMAND ---------- - -( - dfResults - .withColumn("dbfs", when(col("tableLocation").contains("dbfs:"), True).otherwise(False)) - .groupBy("dbfs") - .count() - .display() -) - -# COMMAND ---------- - -( - dfResults - .withColumn("database", split(col("tableName"), "\.")[0]) - .withColumn("dbfs", when(col("tableLocation").contains("dbfs:"), True).otherwise(False)) - .groupBy(["database", "dbfs"]) - .count() - .display() -) - -# COMMAND ---------- - -( - dfResults - .groupBy("tableProvider") - .count() - .display() -) - -# COMMAND ---------- - -( - dfResults - .withColumn("database", split(col("tableName"), "\.")[0]) - .groupBy(["database", "tableProvider"]) - .count() - .selectExpr("count AS value", "database", "tableProvider") - .display() -) - -# COMMAND ---------- - -( - dfResults - .groupBy("tableSize") - 
.count() - .display() -) - -# COMMAND ---------- - -( - dfResults - .withColumn("database", split(col("tableName"), "\.")[0]) - .groupBy("database") - .sum() - .selectExpr("`sum(tableSize)`/1000000000000 AS value", "database") - .display() -) - -# COMMAND ---------- - -( - dfResults - .withColumn("database", split(col("tableName"), "\.")[0]) - .groupBy(["database", "tableVersion"]) - .count() - .selectExpr("count AS value", "database", "tableVersion") - .display() -) - -# COMMAND ---------- - - From 7b0f1bc55191f96225c7231231ce3b7d775c79c5 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 13 Dec 2023 15:10:06 -0500 Subject: [PATCH 062/111] delete extra files --- data/notebooks/Metastore_Scout.py | 88 ------------------------------- 1 file changed, 88 deletions(-) delete mode 100644 data/notebooks/Metastore_Scout.py diff --git a/data/notebooks/Metastore_Scout.py b/data/notebooks/Metastore_Scout.py deleted file mode 100644 index 545b3f0c..00000000 --- a/data/notebooks/Metastore_Scout.py +++ /dev/null @@ -1,88 +0,0 @@ -# Databricks notebook source -!pip install tqdm - -# COMMAND ---------- - -from pyspark.sql.functions import * -from tqdm import tqdm - -# COMMAND ---------- - -dbutils.widgets.text("database_list", "") -database_list = dbutils.widgets.get("database_list").split(",") - -# COMMAND ---------- - -def getAllDatabases(): - databaseList = spark.sql(f"SHOW DATABASES").select("databaseName").rdd.flatMap(lambda x:x).collect() - return databaseList - -def getAllTables(database): - tableList = spark.sql(f"SHOW TABLES IN {database}").select("tableName").rdd.flatMap(lambda x:x).collect() - databaseAndTableList = [f"{database}.{t}" for t in tableList] - return databaseAndTableList - -def getTableDetail(table, detail): - try: - tableDetail = spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == '{detail}'").select("data_type").rdd.flatMap(lambda x:x).collect()[0] - except Exception as e: - tableDetail = "N/A" - return tableDetail - -def getTableSize(table): - spark.sql(f"ANALYZE TABLE {table} COMPUTE STATISTICS NOSCAN") - try: - tableSize = spark.sql(f"DESCRIBE DETAIL {table}").collect()[0]['sizeInBytes'] - except Exception as e: - tableSize = -1 - return tableSize - -def getTableDDL(table): - tableDDL = spark.sql(f"""SHOW CREATE TABLE {table}""").collect()[0][0] - return tableDDL - -# COMMAND ---------- - -def main_scout(database_list): - - if database_list == ['all']: - database_list = getAllDatabases() - - print(f"Analyzing {len(database_list)} databases.") - fullTableList = [] - - for database in database_list: - tableList = getAllTables(database) - print(f"{database}: {len(tableList)}") - fullTableList.extend(tableList) - - print(f"Found {len(fullTableList)} in {len(database_list)} databases.") - - fullTableDetails = [] - for table in tqdm(fullTableList): - try: - tableType = getTableDetail(table, "Type") - tableLocation = getTableDetail(table, "Location") - tableProvider = getTableDetail(table, "Provider") - tableVersion = getTableDetail(table, "Created By") - tableSize = getTableSize(table) - tableDDL = getTableDDL(table) - fullTableDetails.append((table, tableType, tableLocation, tableProvider, tableVersion, tableSize, tableDDL)) - except Exception as e: - print(str(e)) - continue - - columns = ["tableName", "tableType", "tableLocation", "tableProvider", "tableVersion", "tableSize", "tableDDL"] - spark.createDataFrame(data=fullTableDetails, schema = columns).write.mode("overwrite").saveAsTable("uc_discovery.metastore") - -# COMMAND 
---------- - -main_scout(database_list) - -# COMMAND ---------- - -spark.read.table("uc_discovery.metastore").display() - -# COMMAND ---------- - - From 0b67c31c4fd7430bab64ec7d513b5658563f2507 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Thu, 14 Dec 2023 15:35:25 -0500 Subject: [PATCH 063/111] update dbclient/ClustersClient.py to update nitro --- dbclient/ClustersClient.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 5678ddda..203a6cce 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -258,7 +258,7 @@ def get_new_policy_id_dict(self, policy_file='cluster_policies.log'): policy_id_dict[old_policy_id] = current_policies_dict[policy_name] # old_id : new_id return policy_id_dict - def nitro_instance_mapping(self, node_instance_type_id, driver_node_instance_type_id): + def nitro_instance_mapping(self, instance_type_id): dict_from_csv = {} real_path = os.path.dirname(os.path.realpath(__file__)) csv_file = f'{real_path}/../data/nitro_mapping.csv' @@ -267,9 +267,8 @@ def nitro_instance_mapping(self, node_instance_type_id, driver_node_instance_typ for row in reader: dict_from_csv[row['PVC Instance Type']] = row['Recommended Nitro Instance Type'] - nitro_node_instance_id = dict_from_csv[node_instance_type_id] - nitro_driver_node_instance_id = dict_from_csv[driver_node_instance_type_id] - return nitro_node_instance_id, nitro_driver_node_instance_id + nitro_instance_type_id = dict_from_csv[instance_type_id] + return nitro_instance_type_id def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None, nitro=False): """ @@ -316,7 +315,10 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus cluster_conf['custom_tags'] = {'OriginalCreator': cluster_creator} new_cluster_conf = cluster_conf if nitro: - new_cluster_conf['node_type_id'], new_cluster_conf['driver_node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['node_type_id'], new_cluster_conf['driver_node_type_id']) + if 'node_type_id' in new_cluster_conf: + new_cluster_conf['node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['node_type_id']) + if 'driver_node_type_id' in new_cluster_conf: + new_cluster_conf['driver_node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['driver_node_type_id']) print("Creating cluster: {0}".format(new_cluster_conf['cluster_name'])) cluster_resp = self.post('/clusters/create', new_cluster_conf) From 2e980cfb2f50a3dc0cacdda3caa4abf90832731f Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Fri, 5 Jan 2024 14:06:16 -0500 Subject: [PATCH 064/111] adding replace group scripts --- data/notebooks/replace_groups.py | 157 +++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 data/notebooks/replace_groups.py diff --git a/data/notebooks/replace_groups.py b/data/notebooks/replace_groups.py new file mode 100644 index 00000000..abd51f39 --- /dev/null +++ b/data/notebooks/replace_groups.py @@ -0,0 +1,157 @@ +import argparse +import os +import csv + +def pretty_print_dict(dict_): + """ + summary: prints a dictionary object in a pretty format + + PARAMETERS: + dict_: dictionary object + + RETURNS: + n/a + """ + for key, value in dict_.items(): + print(f"{key}: {value}") + +def to_dict(csv_file, email_column='newEmail'): + """ + summary: converts a csv or text file (or another comma delim file) into a + dictionary object + + PARAMETERS: + csv_file: path file of the comma delim file, 
assumes that there are no column + headings, each user address is split by a new line, and the old and new + address are split by a comma in that order. + + RETURNS: + dict_from_csv: dictionary object where key is the old item and value + is new item + """ + dict_from_csv = {} + with open(csv_file, newline='', mode='r') as f: + reader = csv.DictReader(f) + for row in reader: + dict_from_csv[row['group_name']] = row[email_column] + return dict_from_csv + +def map(file_name, mapping): + """ + summary: reads parameter file_name and replaces all places where previous email + address is used with the new item as indicated in mapping + + PARAMETERS: + file_name: path of the file that is to be read + mapping: dict where key is the previous item and value is the + new item + + RETURNS: + data: a text object + + """ + print(f" Currently mapping {file_name}") + with open(file_name, "r") as f: + data = f.read() + for e in mapping: + data = data.replace(e, mapping[e]) + return data + +def write(file_name, data_write): + """ + summary: writes parameter data_write to the path indicated by parameter + file_name + + PARAMETERS: + file_name: path of the file that is to be written + data_write: text object + + RETURNS: + n/a + """ + with open(file_name, "w") as f: + f.write(data_write) + + +def mapping_file(file_name, mapping): + """ + summary: maps a single file and writes it to a new file and saves the old + log file with the '_prev' suffix + + PARAMETERS: + file_name: path of the file to map + mapping: dict where key is the previous item and value is the + new item + + RETURNS: + n/a + """ + # this code here (directly referencing the number 4) assumes that the file name + # has the 3 letter extension (e.g. something.txt or something.csv + data = map(file_name, mapping) + write(file_name, data) + +def rename_group_file(mapping): + """ + summary: renames the user folder by moving all files to new directory + + PARAMETERS: + mapping: dict where key is the previous item and value is the + new item + + RETURNS: + n/a + """ + groups = os.listdir('groups') + for g in groups: + if '.DS_Store' in g: + continue + if mapping.get(g, False): + os.rename("groups/"+g, "groups/"+mapping[g]) + +def main(): + all_args = argparse.ArgumentParser() + all_args.add_argument("--dir", "--file", dest="file", required=True, help='directory needs to be updated via mapping.') + #all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') + #all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') + + args = all_args.parse_args() + file_name = args.file + #mapping_file_ = args.mapping + #email_column = args.column + + #mapping = to_dict(mapping_file_, email_column) + mapping = {"old_group_name": "new_group_name"} + print("--------------------") + pretty_print_dict(mapping) + print("--------------------") + yesno = input("Confirm mapping (y/n): ") + if yesno.lower() != "y": + exit() + + # change the current working director to specified path + os.chdir(file_name) + # verify the path using getcwd() + cwd = os.getcwd() + print("--------------------") + print("Current working directory is:", cwd) + + logs = os.listdir() + + for file in logs: + if '.DS_Store' in file: + continue + # making sure we are only getting the logs + if ".log" in file: + mapping_file(file, mapping) + if "groups" == file: + groups = os.listdir("groups") + for g in groups: + if '.DS_Store' in g: + continue + 
mapping_file("groups/"+g, mapping) + + rename_group_file(mapping) + +if __name__ == "__main__": + main() From 444c7cb897000d48a2d216b95692661b96c701f9 Mon Sep 17 00:00:00 2001 From: veenaramesh Date: Fri, 5 Jan 2024 14:12:55 -0500 Subject: [PATCH 065/111] wrong push! new push --- data/notebooks/replace_groups.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/data/notebooks/replace_groups.py b/data/notebooks/replace_groups.py index abd51f39..f577a46e 100644 --- a/data/notebooks/replace_groups.py +++ b/data/notebooks/replace_groups.py @@ -112,15 +112,15 @@ def rename_group_file(mapping): def main(): all_args = argparse.ArgumentParser() all_args.add_argument("--dir", "--file", dest="file", required=True, help='directory needs to be updated via mapping.') - #all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') - #all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') + all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') + all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') args = all_args.parse_args() file_name = args.file - #mapping_file_ = args.mapping - #email_column = args.column + mapping_file_ = args.mapping + email_column = args.column - #mapping = to_dict(mapping_file_, email_column) + mapping = to_dict(mapping_file_, email_column) mapping = {"old_group_name": "new_group_name"} print("--------------------") pretty_print_dict(mapping) From 68c6aeee606e63f77cb94152e38d2bce864be9d6 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:39:22 -0500 Subject: [PATCH 066/111] delete notebooks! 
--- data/notebooks/delete_clusters.py | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 data/notebooks/delete_clusters.py diff --git a/data/notebooks/delete_clusters.py b/data/notebooks/delete_clusters.py new file mode 100644 index 00000000..91853d61 --- /dev/null +++ b/data/notebooks/delete_clusters.py @@ -0,0 +1,32 @@ +import requests + +# This script will delete all clusters in a Databricks workspace +# Set the Databricks API endpoint and access token + +DATABRICKS_INSTANCE = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) +DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) + +# Set the API endpoint and access token +api_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters/list" +access_token = DATABRICKS_TOKEN + +# Send a GET request to retrieve the list of clusters +response = requests.get(api_endpoint, headers={"Authorization": f"Bearer {access_token}"}) + +# Check if the request was successful +if response.status_code == 200: + clusters = response.json()["clusters"] + print(f"Found {len(clusters)} clusters") + # Delete each cluster + for cluster in clusters: + cluster_id = cluster["cluster_id"] + delete_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters/delete?cluster_id={cluster_id}" + delete_response = requests.post(delete_endpoint, headers={"Authorization": f"Bearer {access_token}"}) + + # Check if the cluster deletion was successful + if delete_response.status_code == 200: + print(f"Cluster {cluster_id} deleted successfully") + else: + print(f"Failed to delete cluster {cluster_id}") +else: + print("Failed to retrieve the list of clusters") From 4805188c7e1fbefd58bab8f0313b96f725f4059d Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 9 Jan 2024 14:51:25 -0500 Subject: [PATCH 067/111] Update delete_clusters.py - unpins now as well --- data/notebooks/delete_clusters.py | 34 +++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/data/notebooks/delete_clusters.py b/data/notebooks/delete_clusters.py index 91853d61..e731f102 100644 --- a/data/notebooks/delete_clusters.py +++ b/data/notebooks/delete_clusters.py @@ -3,6 +3,9 @@ # This script will delete all clusters in a Databricks workspace # Set the Databricks API endpoint and access token +# This script will delete all clusters in a Databricks workspace +# Set the Databricks API endpoint and access token +CURRENT_CLUSTER_ID = dbutils.notebook.entry_point.getDbutils().notebook().getContext().clusterId().getOrElse(None) DATABRICKS_INSTANCE = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) @@ -20,13 +23,32 @@ # Delete each cluster for cluster in clusters: cluster_id = cluster["cluster_id"] - delete_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters/delete?cluster_id={cluster_id}" - delete_response = requests.post(delete_endpoint, headers={"Authorization": f"Bearer {access_token}"}) - # Check if the cluster deletion was successful - if delete_response.status_code == 200: - print(f"Cluster {cluster_id} deleted successfully") + # Unpin the cluster + unpin_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters/edit" + unpin_payload = { + "cluster_id": cluster_id, + "pinned": False + } + unpin_response = requests.post(unpin_endpoint, 
headers={"Authorization": f"Bearer {access_token}"}, json=unpin_payload) + + # Check if the unpin operation was successful + if unpin_response.status_code == 200: + print(f"Cluster {cluster_id} unpinned successfully") + + if cluster_id == CURRENT_CLUSTER_ID: + print(f"Skipping current cluster {cluster_id}") + continue + # Delete the cluster + delete_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters/delete?cluster_id={cluster_id}" + delete_response = requests.post(delete_endpoint, headers={"Authorization": f"Bearer {access_token}"}) + + # Check if the cluster deletion was successful + if delete_response.status_code == 200: + print(f"Cluster {cluster_id} deleted successfully") + else: + print(f"Failed to delete cluster {cluster_id}") else: - print(f"Failed to delete cluster {cluster_id}") + print(f"Failed to unpin cluster {cluster_id}") else: print("Failed to retrieve the list of clusters") From fc616d3bae14c8df6cd1a12677676cfd03c35418 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 9 Jan 2024 15:03:25 -0500 Subject: [PATCH 068/111] Update delete_clusters.py --- data/notebooks/delete_clusters.py | 36 +++++++++++-------------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/data/notebooks/delete_clusters.py b/data/notebooks/delete_clusters.py index e731f102..116bc221 100644 --- a/data/notebooks/delete_clusters.py +++ b/data/notebooks/delete_clusters.py @@ -24,31 +24,19 @@ for cluster in clusters: cluster_id = cluster["cluster_id"] - # Unpin the cluster - unpin_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters/edit" - unpin_payload = { - "cluster_id": cluster_id, - "pinned": False - } - unpin_response = requests.post(unpin_endpoint, headers={"Authorization": f"Bearer {access_token}"}, json=unpin_payload) + print(f"Cluster {cluster_id} unpinned successfully") - # Check if the unpin operation was successful - if unpin_response.status_code == 200: - print(f"Cluster {cluster_id} unpinned successfully") - - if cluster_id == CURRENT_CLUSTER_ID: - print(f"Skipping current cluster {cluster_id}") - continue - # Delete the cluster - delete_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters/delete?cluster_id={cluster_id}" - delete_response = requests.post(delete_endpoint, headers={"Authorization": f"Bearer {access_token}"}) - - # Check if the cluster deletion was successful - if delete_response.status_code == 200: - print(f"Cluster {cluster_id} deleted successfully") - else: - print(f"Failed to delete cluster {cluster_id}") + if cluster_id == CURRENT_CLUSTER_ID: + print(f"Skipping current cluster {cluster_id}") + continue + # Delete the cluster + delete_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters//api/2.0/clusters/permanent-delete?cluster_id={cluster_id}" + delete_response = requests.post(delete_endpoint, headers={"Authorization": f"Bearer {access_token}"}) + + # Check if the cluster deletion was successful + if delete_response.status_code == 200: + print(f"Cluster {cluster_id} deleted successfully") else: - print(f"Failed to unpin cluster {cluster_id}") + print(f"Failed to delete cluster {cluster_id}") else: print("Failed to retrieve the list of clusters") From d7e2db945d3470adc7b227186b420d71e610ae6f Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 11 Jan 2024 13:33:45 -0500 Subject: [PATCH 069/111] Clusters Scout to see what DBRs are being used --- data/notebooks/Clusters_Scout.py | 197 +++++++++++++++++++++++++++++++ 1 file changed, 197 
insertions(+) create mode 100644 data/notebooks/Clusters_Scout.py diff --git a/data/notebooks/Clusters_Scout.py b/data/notebooks/Clusters_Scout.py new file mode 100644 index 00000000..6766a601 --- /dev/null +++ b/data/notebooks/Clusters_Scout.py @@ -0,0 +1,197 @@ +# Databricks notebook source +# Databricks notebook source +import json, os, datetime, requests +import requests.packages.urllib3 + +global pprint_j + +requests.packages.urllib3.disable_warnings() + + +# Helper to pretty print json +def pprint_j(i): + print(json.dumps(i, indent=4, sort_keys=True)) + + +class dbclient: + """ + Rest API Wrapper for Databricks APIs + """ + # set of http error codes to throw an exception if hit. Handles client and auth errors + http_error_codes = (401, 403) + + def __init__(self, token, url): + self._token = {'Authorization': 'Bearer {0}'.format(token)} + self._url = url.rstrip("/") + self._is_verbose = False + self._verify_ssl = False + if self._verify_ssl: + # set these env variables if skip SSL verification is enabled + os.environ['REQUESTS_CA_BUNDLE'] = "" + os.environ['CURL_CA_BUNDLE'] = "" + + def is_aws(self): + return self._is_aws + + def is_verbose(self): + return self._is_verbose + + def is_skip_failed(self): + return self._skip_failed + + def test_connection(self): + # verify the proper url settings to configure this client + if self._url[-4:] != '.com' and self._url[-4:] != '.net': + print("Hostname should end in '.com'") + return -1 + results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, + verify=self._verify_ssl) + http_status_code = results.status_code + if http_status_code != 200: + print("Error. Either the credentials have expired or the credentials don't have proper permissions.") + print("If you have a ~/.netrc file, check those credentials. 
Those take precedence over passed input.") + print(results.text) + return -1 + return 0 + + def get(self, endpoint, json_params=None, version='2.0', print_json=False): + if version: + ver = version + full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint + if self.is_verbose(): + print("Get: {0}".format(full_endpoint)) + if json_params: + raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) + results = raw_results.json() + else: + raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) + results = raw_results.json() + if print_json: + print(json.dumps(results, indent=4, sort_keys=True)) + if type(results) == list: + results = {'elements': results} + results['http_status_code'] = raw_results.status_code + return results + + def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): + if version: + ver = version + full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint + if self.is_verbose(): + print("{0}: {1}".format(http_type, full_endpoint)) + if json_params: + if http_type == 'post': + if files_json: + raw_results = requests.post(full_endpoint, headers=self._token, + data=json_params, files=files_json, verify=self._verify_ssl) + else: + raw_results = requests.post(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + if http_type == 'put': + raw_results = requests.put(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + if http_type == 'patch': + raw_results = requests.patch(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, + http_status_code, + raw_results.text)) + results = raw_results.json() + else: + print("Must have a payload in json_args param.") + return {} + if print_json: + print(json.dumps(results, indent=4, sort_keys=True)) + # if results are empty, let's return the return status + if results: + results['http_status_code'] = raw_results.status_code + return results + else: + return {'http_status_code': raw_results.status_code} + + def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): + return self.http_req('post', endpoint, json_params, version, print_json, files_json) + + def put(self, endpoint, json_params, version='2.0', print_json=False): + return self.http_req('put', endpoint, json_params, version, print_json) + + def patch(self, endpoint, json_params, version='2.0', print_json=False): + return self.http_req('patch', endpoint, json_params, version, print_json) + + @staticmethod + def my_map(F, items): + to_return = [] + for elem in items: + to_return.append(F(elem)) + return to_return + + def set_export_dir(self, dir_location): + self._export_dir = dir_location + + def get_export_dir(self): + return self._export_dir + + def get_latest_spark_version(self): + versions = self.get('/clusters/spark-versions')['versions'] + v_sorted = 
sorted(versions, key=lambda i: i['key'], reverse=True) + for x in v_sorted: + img_type = x['key'].split('-')[1][0:5] + if img_type == 'scala': + return x + + +# COMMAND ---------- + +class discoveryClient(dbclient): + def get_clusters(self): + clusters_list = self.get('/clusters/list').get('clusters', []) + return clusters_list + + def get_num_defined_clusters(self): + clusters_list = self.get_clusters() + return len(clusters_list) + +# COMMAND ---------- + +url = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) +token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) + +client = discoveryClient(token, url) + +# COMMAND ---------- + +clusters = client.get_clusters() +clusters[0] + + +# COMMAND ---------- + +cluster_details = [] +for cluster in clusters: + cluster_id = cluster['cluster_id'] + cluster_name = cluster['cluster_name'] + creator = cluster['creator_user_name'] + node_type = cluster['node_type_id'] + driver_type = cluster['driver_node_type_id'] + custom_tags = cluster['custom_tags'] + spark_version = cluster['spark_version'] + instance_profile = cluster['aws_attributes'].get('instance_profile_arn', 'No Instance Profile') + cluster_details.append((cluster_id, cluster_name, creator, node_type, driver_type, custom_tags, spark_version, instance_profile)) + +# COMMAND ---------- + +columns = ['cluster_id', 'name', 'creator', 'node_type', 'driver_type', 'custom_tags', 'spark_version', 'instance_profile'] +spark.createDataFrame(data=cluster_details, schema = columns).write.mode("overwrite").saveAsTable("uc_discovery.clusters") + +# COMMAND ---------- + + From 8ba490bd63805b7bed9b533953f8b2fa562851f6 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 11 Jan 2024 15:27:14 -0500 Subject: [PATCH 070/111] Update rename_emails.py Updating to make the replace more specific (not a huge find + replace). Works for service principals now as well.
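To illustrate the targeted replacement this change switches to (rather than a blanket data.replace(old, new) over the whole log file), here is a minimal standalone sketch. It mirrors the JSON keys the updated map() touches; the helper name replace_principal and the sample values are made up for the example.

    import json

    def replace_principal(data: str, old: str, new: str) -> str:
        # Only rewrite occurrences that sit behind a known key, so a short name
        # like "admin" cannot clobber unrelated substrings elsewhere in the log.
        if "@" in new:  # the new identity is a user email
            for key in ("user_name", "display", "userName", "principal"):
                data = data.replace(f'"{key}": "{old}"', f'"{key}": "{new}"')
            data = data.replace(f'"/Users/{old}/', f'"/Users/{new}/')  # notebook paths
        else:  # the new identity is a service principal application id
            data = data.replace(f'"user_name": "{old}"', f'"service_principal_name": "{new}"')
            for key in ("display", "principal"):
                data = data.replace(f'"{key}": "{old}"', f'"{key}": "{new}"')
        return data

    if __name__ == "__main__":
        acl = json.dumps({"user_name": "old_user@example.com", "all_permissions": []})
        print(replace_principal(acl, "old_user@example.com", "new_user@example.com"))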
--- data/notebooks/rename_emails.py | 39 +++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/data/notebooks/rename_emails.py b/data/notebooks/rename_emails.py index 6a212dfe..bdc878e9 100644 --- a/data/notebooks/rename_emails.py +++ b/data/notebooks/rename_emails.py @@ -55,7 +55,19 @@ def map(file_name, mapping): data = f.read() print(f" Currently mapping {file_name}") for e in mapping: - data = data.replace(e, mapping[e]) + if "@" in mapping[e]: # this is an user + data = data.replace(f"\"user_name\": \"{e}\"", f"\"user_name\": \"{mapping[e]}\"") # in most ACLs + print(f"\"/Users/{e}/") + print(f"\"/Users/{mapping[e]}/") + data = data.replace(f"\"/Users/{e}/", f"\"/Users/{mapping[e]}/") # in notebook paths + data = data.replace(f"\"display\": \"{e}\"", f"\"display\": \"{mapping[e]}\"") # in groups + data = data.replace(f"\"userName\": \"{e}\"", f"\"userName\": \"{mapping[e]}\"") # in groups + data = data.replace(f"\"principal\": \"{e}\"", f"\"principal\": \"{mapping[e]}\"") # in secret ACLs + else: # this is a service principal + data = data.replace(f"\"user_name\": \"{e}\"", f"\"service_principal_name\": \"{mapping[e]}\"") # in most ACLs + data = data.replace(f"\"display\": \"{e}\"", f"\"display\": \"{mapping[e]}\"") # in groups + data = data.replace(f"\"principal\": \"{e}\"", f"\"principal\": \"{mapping[e]}\"") # in secret ACLs + return data def write(file_name, data_write): @@ -118,16 +130,17 @@ def mapping_file(file_name, mapping): def main(): all_args = argparse.ArgumentParser() - all_args.add_argument("--dir", "--file", dest="file", required=True, help='directory needs to be updated via mapping.') - all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') - all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') + #all_args.add_argument("--dir", "--file", dest="file", required=True, help='directory needs to be updated via mapping.') + #all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') + #all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') - args = all_args.parse_args() - file_name = args.file - mapping_file_ = args.mapping - email_column = args.column + #args = all_args.parse_args() + #file_name = args.file + #mapping_file_ = args.mapping + #email_column = args.column - mapping = to_dict(mapping_file_, email_column) + #mapping = to_dict(mapping_file_, email_column) + mapping = {"admin": "ADMIN_NEW@GMAIL.COM", "service_principal": "service_principal_id"} print("--------------------") pretty_print_dict(mapping) print("--------------------") @@ -136,7 +149,8 @@ def main(): exit() # change the current working director to specified path - os.chdir(file_name) + os.chdir("logs/session") + #os.chdir(file_name) # verify the path using getcwd() cwd = os.getcwd() print("--------------------") @@ -151,10 +165,11 @@ def main(): if "groups" == file: groups = os.listdir("groups") for g in groups: - mapping_file("groups/"+g, mapping) + if g != ".DS_Store": + mapping_file("groups/"+g, mapping) - rename_users_folder(mapping) + #rename_users_folder(mapping) if __name__ == "__main__": main() From f0a414ec1d8a47b23682aced08a979c95ea834be Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Fri, 
12 Jan 2024 10:05:58 -0500 Subject: [PATCH 071/111] code to update clusters with correct IPs --- data/notebooks/patch_clusters.py | 194 +++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 data/notebooks/patch_clusters.py diff --git a/data/notebooks/patch_clusters.py b/data/notebooks/patch_clusters.py new file mode 100644 index 00000000..3a333463 --- /dev/null +++ b/data/notebooks/patch_clusters.py @@ -0,0 +1,194 @@ +import json +from datetime import datetime +import os +import requests +import argparse + +class dbclient: + """ + Rest API Wrapper for Databricks APIs + """ + # set of http error codes to throw an exception if hit. Handles client and auth errors + http_error_codes = (401, 403) + + def __init__(self, token, url): + self._token = {'Authorization': 'Bearer {0}'.format(token)} + self._url = url.rstrip("/") + self._is_verbose = False + self._verify_ssl = False + if self._verify_ssl: + # set these env variables if skip SSL verification is enabled + os.environ['REQUESTS_CA_BUNDLE'] = "" + os.environ['CURL_CA_BUNDLE'] = "" + + def is_aws(self): + return self._is_aws + + def is_verbose(self): + return self._is_verbose + + def is_skip_failed(self): + return self._skip_failed + + def test_connection(self): + # verify the proper url settings to configure this client + if self._url[-4:] != '.com' and self._url[-4:] != '.net': + print("Hostname should end in '.com'") + return -1 + results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, + verify=self._verify_ssl) + http_status_code = results.status_code + if http_status_code != 200: + print("Error. Either the credentials have expired or the credentials don't have proper permissions.") + print("If you have a ~/.netrc file, check those credentials. 
Those take precedence over passed input.") + print(results.text) + return -1 + return 0 + + def get(self, endpoint, json_params=None, version='2.0', print_json=False): + if version: + ver = version + full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint + if self.is_verbose(): + print("Get: {0}".format(full_endpoint)) + if json_params: + raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) + results = raw_results.json() + else: + raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) + results = raw_results.json() + if print_json: + print(json.dumps(results, indent=4, sort_keys=True)) + if type(results) == list: + results = {'elements': results} + results['http_status_code'] = raw_results.status_code + return results + + def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): + if version: + ver = version + full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint + if self.is_verbose(): + print("{0}: {1}".format(http_type, full_endpoint)) + if json_params: + if http_type == 'post': + if files_json: + raw_results = requests.post(full_endpoint, headers=self._token, + data=json_params, files=files_json, verify=self._verify_ssl) + else: + raw_results = requests.post(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + if http_type == 'put': + raw_results = requests.put(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + if http_type == 'patch': + raw_results = requests.patch(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, + http_status_code, + raw_results.text)) + results = raw_results.json() + else: + print("Must have a payload in json_args param.") + return {} + if print_json: + print(json.dumps(results, indent=4, sort_keys=True)) + # if results are empty, let's return the return status + if results: + results['http_status_code'] = raw_results.status_code + return results + else: + return {'http_status_code': raw_results.status_code} + + def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): + return self.http_req('post', endpoint, json_params, version, print_json, files_json) + + def put(self, endpoint, json_params, version='2.0', print_json=False): + return self.http_req('put', endpoint, json_params, version, print_json) + + def patch(self, endpoint, json_params, version='2.0', print_json=False): + return self.http_req('patch', endpoint, json_params, version, print_json) + +def read_log(file_name): + """ + summary: reads a given log + """ + try: + with open(file_name) as f: + data = f.read().split("\n") + return data + except FileNotFoundError as e: + return print(f"{datetime.now()} Error: {file_name} not found. ") + except Exception as e: + print(f"{datetime.now()} Error: There was an unknown error reading {file_name}. 
") + print(e) + return '' + +def get_clusters_list(client): + # databricks clusters list + endpoint = "/clusters/list" + clusters_list = client.get(endpoint).get('clusters', []) + return clusters_list + +def get_clusters_ips(log_name): + data = read_log(log_name) + instance_profiles = {} + for d in data: + if len(d) != 0: + d = d.strip() + d = json.loads(d) + c_name = d.get('cluster_name', 0) + ip = d.get('aws_attributes', {}).get('instance_profile_arn', 0) + if ip != 0: + instance_profiles[c_name] = ip + return instance_profiles + +def update_cluster_ips(client, clusters_list, instance_profiles): + for c in clusters_list: + c_name = c.get('cluster_name', 0) + if c_name in instance_profiles.keys(): + c['aws_attributes']['instance_profile_arn'] = instance_profiles[c_name] + endpoint = "/clusters/edit" + json_params = c + client.patch(endpoint, json_params) + + return clusters_list + +def confirm_updated_ips(client, instance_profiles): + cnames = get_clusters_list(client) + for c in cnames: # in updated e2 clusters + c_name = c.get('cluster_name', 0) + ip = c.get('aws_attributes', {}).get('instance_profile_arn', 0) # updated ip? + if c_name in instance_profiles.keys(): + if ip != 0: + if ip != instance_profiles[c_name]: + print(f"{datetime.now()} Error: {c_name} was not updated. ") + else: + print(f"{datetime.now()} {c_name} was updated. ") + else: + print(f"{datetime.now()} Error: {c_name} was not updated. ") + else: + print(f"{datetime.now()} {c_name} did not require update. ") + +if __name__ == "__main__": + # get the arguments + parser = argparse.ArgumentParser() + parser.add_argument("--log", help="log file to read", default="logs/session/clusters.log") + parser.add_argument("--token", help="databricks token to use", default="") + parser.add_argument("--url", help="databricks url to use", default="") + args = parser.parse_args() + + client = dbclient(args.token, args.url) + cnames = get_clusters_list(client) + ips = get_clusters_ips(log_name=args.log) + update_cluster_ips(client, cnames, ips) + confirm_updated_ips(client, ips) \ No newline at end of file From ce2bb718a7a276ec2721d876a83d4d6b7172c7ea Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Fri, 12 Jan 2024 11:58:53 -0500 Subject: [PATCH 072/111] Update patch_clusters.py --- data/notebooks/patch_clusters.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/data/notebooks/patch_clusters.py b/data/notebooks/patch_clusters.py index 3a333463..2a78bc61 100644 --- a/data/notebooks/patch_clusters.py +++ b/data/notebooks/patch_clusters.py @@ -159,8 +159,9 @@ def update_cluster_ips(client, clusters_list, instance_profiles): c['aws_attributes']['instance_profile_arn'] = instance_profiles[c_name] endpoint = "/clusters/edit" json_params = c - client.patch(endpoint, json_params) - + results = client.patch(endpoint, json_params) + print(f"{datetime.now()} {c_name} was updated with {instance_profiles[c_name]}. 
Status code: {results.get('http_status_code', 0)}") + return clusters_list def confirm_updated_ips(client, instance_profiles): @@ -191,4 +192,4 @@ def confirm_updated_ips(client, instance_profiles): cnames = get_clusters_list(client) ips = get_clusters_ips(log_name=args.log) update_cluster_ips(client, cnames, ips) - confirm_updated_ips(client, ips) \ No newline at end of file + confirm_updated_ips(client, ips) From 3cd46fd45d48e4994cc19381517f0813444ff0eb Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Fri, 12 Jan 2024 12:53:28 -0500 Subject: [PATCH 073/111] Update patch_clusters.py --- data/notebooks/patch_clusters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/notebooks/patch_clusters.py b/data/notebooks/patch_clusters.py index 2a78bc61..31e3e0e2 100644 --- a/data/notebooks/patch_clusters.py +++ b/data/notebooks/patch_clusters.py @@ -159,7 +159,8 @@ def update_cluster_ips(client, clusters_list, instance_profiles): c['aws_attributes']['instance_profile_arn'] = instance_profiles[c_name] endpoint = "/clusters/edit" json_params = c - results = client.patch(endpoint, json_params) + print(json_params) + results = client.post(endpoint, json_params) print(f"{datetime.now()} {c_name} was updated with {instance_profiles[c_name]}. Status code: {results.get('http_status_code', 0)}") return clusters_list From 91bd2be9457c45b4883c5e1a286f598a6e9b9171 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Fri, 12 Jan 2024 13:34:45 -0500 Subject: [PATCH 074/111] Update rename_emails.py --- data/notebooks/rename_emails.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/data/notebooks/rename_emails.py b/data/notebooks/rename_emails.py index bdc878e9..fbc49ca6 100644 --- a/data/notebooks/rename_emails.py +++ b/data/notebooks/rename_emails.py @@ -130,17 +130,17 @@ def mapping_file(file_name, mapping): def main(): all_args = argparse.ArgumentParser() - #all_args.add_argument("--dir", "--file", dest="file", required=True, help='directory needs to be updated via mapping.') - #all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') - #all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') + all_args.add_argument("--dir", "--file", dest="file", required=True, help='directory needs to be updated via mapping.') + all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') + all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') - #args = all_args.parse_args() - #file_name = args.file - #mapping_file_ = args.mapping - #email_column = args.column + args = all_args.parse_args() + file_name = args.file + mapping_file_ = args.mapping + email_column = args.column - #mapping = to_dict(mapping_file_, email_column) - mapping = {"admin": "ADMIN_NEW@GMAIL.COM", "service_principal": "service_principal_id"} + mapping = to_dict(mapping_file_, email_column) + #mapping = {"admin": "ADMIN_NEW@GMAIL.COM", "service_principal": "service_principal_id"} print("--------------------") pretty_print_dict(mapping) print("--------------------") @@ -149,8 +149,8 @@ def main(): exit() # change the current working director to specified path - os.chdir("logs/session") - 
#os.chdir(file_name) + #os.chdir("logs/session") + os.chdir(file_name) # verify the path using getcwd() cwd = os.getcwd() print("--------------------") From 754efb149f7a0053576fa75f45d4cf2943f96599 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 16 Jan 2024 16:07:03 -0500 Subject: [PATCH 075/111] Update patch_clusters.py --- data/notebooks/patch_clusters.py | 69 ++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/data/notebooks/patch_clusters.py b/data/notebooks/patch_clusters.py index 31e3e0e2..1eae442a 100644 --- a/data/notebooks/patch_clusters.py +++ b/data/notebooks/patch_clusters.py @@ -4,6 +4,33 @@ import requests import argparse +import warnings +warnings.filterwarnings("ignore") + +create_configs = {'num_workers', + 'autoscale', + 'cluster_name', + 'spark_version', + 'spark_conf', + 'aws_attributes', + 'node_type_id', + 'driver_node_type_id', + 'ssh_public_keys', + 'custom_tags', + 'cluster_log_conf', + 'init_scripts', + 'docker_image', + 'spark_env_vars', + 'autotermination_minutes', + 'enable_elastic_disk', + 'instance_pool_id', + 'driver_instance_pool_id', + 'policy_id', + 'pinned_by_user_name', + 'creator_user_name', + 'cluster_id', + 'data_security_mode'} + class dbclient: """ Rest API Wrapper for Databricks APIs @@ -152,18 +179,34 @@ def get_clusters_ips(log_name): instance_profiles[c_name] = ip return instance_profiles -def update_cluster_ips(client, clusters_list, instance_profiles): - for c in clusters_list: - c_name = c.get('cluster_name', 0) - if c_name in instance_profiles.keys(): - c['aws_attributes']['instance_profile_arn'] = instance_profiles[c_name] - endpoint = "/clusters/edit" - json_params = c - print(json_params) - results = client.post(endpoint, json_params) +def update_cluster_ips(client, instance_profiles): + cnames = get_clusters_list(client) + for c in cnames: + print(c) + if c.get('cluster_name', -1293) in instance_profiles.keys(): + c_name = c.get('cluster_name', 0) + c_id = c.get('cluster_id', 0) + current_cluster_json = client.get(f'/clusters/get?cluster_id={c_id}') + print(current_cluster_json) + run_properties = set(list(current_cluster_json.keys())) - create_configs + for p in run_properties: + del current_cluster_json[p] + if 'aws_attributes' in current_cluster_json: + aws_conf = current_cluster_json.pop('aws_attributes') + iam_role = instance_profiles[c_name] + aws_conf['instance_profile_arn'] = iam_role + else: + aws_conf = {} + iam_role = instance_profiles[c_name] + aws_conf['instance_profile_arn'] = iam_role + + current_cluster_json['aws_attributes'] = aws_conf + + edit_endpoint = "/clusters/edit" + results = client.post(edit_endpoint, current_cluster_json, print_json=False) print(f"{datetime.now()} {c_name} was updated with {instance_profiles[c_name]}. 
Status code: {results.get('http_status_code', 0)}") - - return clusters_list + + return def confirm_updated_ips(client, instance_profiles): cnames = get_clusters_list(client) @@ -190,7 +233,7 @@ def confirm_updated_ips(client, instance_profiles): args = parser.parse_args() client = dbclient(args.token, args.url) - cnames = get_clusters_list(client) + #cnames = get_clusters_list(client) ips = get_clusters_ips(log_name=args.log) - update_cluster_ips(client, cnames, ips) + update_cluster_ips(client, ips) confirm_updated_ips(client, ips) From 6f937f13b67e2740f992a37de964908770d07c72 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 17 Jan 2024 09:09:21 -0500 Subject: [PATCH 076/111] add sample_jobs filter in the future, add this to the import itself --- data/notebooks/create_sample_jobs.py | 95 ++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 data/notebooks/create_sample_jobs.py diff --git a/data/notebooks/create_sample_jobs.py b/data/notebooks/create_sample_jobs.py new file mode 100644 index 00000000..827ed1f3 --- /dev/null +++ b/data/notebooks/create_sample_jobs.py @@ -0,0 +1,95 @@ +import json +import pandas as pd +import csv +import os +import datetime +import argparse + +def read_log(file_name): + try: + with open("./all_jobs/"+file_name) as f: + data = f.read().split("\n") + return data[:-1] + except FileNotFoundError as e: + return '' + except Exception as e: + print("Error while reading file:", file_name, "\n", e) + return '' + +def move_logs(timestamp=""): + # moving all_jobs + os.rename("jobs.log", f"./all_jobs/jobs{timestamp}.log") + os.rename("acl_jobs.log", f"./all_jobs/acl_jobs{timestamp}.log") + +def write_job_log(data, sample_job_ids): + with open("jobs.log", "w") as jl: + for d in data: + try: + d = json.loads(d) + if d['job_id'] in sample_job_ids: + jl.write(json.dumps(d) + "\n") + except: + print("Error while writing jobs.log") + + +def write_job_acls_log(data, sample_job_ids): + with open("acl_jobs.log", "w") as jal: + for d in data: + try: + d = json.loads(d) + if int(d['object_id'].split("/")[-1]) in sample_job_ids: + jal.write(json.dumps(d) + "\n") + except: + print("Error while writing acl_jobs.log") + +def write_rest_job_logs(jobslog, acljobslog, sample_job_ids): + with open("other_jobs.log", "w") as ojl: + for d in jobslog: + try: + d = json.loads(d) + if d['job_id'] not in sample_job_ids: + ojl.write(json.dumps(d) + "\n") + except: + print("Error while writing other_jobs.log") + + with open("other_acl_jobs.log", "w") as ojal: + for d in acljobslog: + try: + d = json.loads(d) + if int(d['object_id'].split("/")[-1]) not in sample_job_ids: + ojal.write(json.dumps(d) + "\n") + except: + print("Error while writing other_acl_jobs.log") + +def main(): + # arguments + parser = argparse.ArgumentParser(description='Create sample jobs log') + parser.add_argument('--jobs', nargs='+', type=int, help='list of job ids to sample') + parser.add_argument('--logs', type=str, help='path to logs folder', default="logs/session/") # path to logs folder + + args = parser.parse_args() + job_ids = args.jobs + logs_path = args.logs + + os.chdir(logs_path) + + if "all_jobs" not in os.listdir(): + os.mkdir("./all_jobs/") + move_logs() + elif "jobs.log" in os.listdir(): + ts = datetime.datetime.now() + move_logs("_"+str(ts)) + + #json objects + job_log_data = read_log("jobs.log") + job_acl_log_data = read_log("acl_jobs.log") + + #move jobs.log into ./alljobs folder + write sample jobs log in main logs folder + 
write_job_log(job_log_data, job_ids) + write_job_acls_log(job_acl_log_data, job_ids) + + #write jobs.log that only contains jobs NOT in sample jobs log + write_rest_job_logs(job_log_data, job_acl_log_data, job_ids) + +if __name__ == "__main__": + main() From 162f11e24ed8fe4486a4ad365da79e1c18288a4c Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 17 Jan 2024 09:13:52 -0500 Subject: [PATCH 077/111] Add metastore scouts --- data/notebooks/Metastore_Scout.py | 112 +++++++++++++++++++++ data/notebooks/Metastore_Scout_no_views.py | 97 ++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 data/notebooks/Metastore_Scout.py create mode 100644 data/notebooks/Metastore_Scout_no_views.py diff --git a/data/notebooks/Metastore_Scout.py b/data/notebooks/Metastore_Scout.py new file mode 100644 index 00000000..218daea9 --- /dev/null +++ b/data/notebooks/Metastore_Scout.py @@ -0,0 +1,112 @@ +# Databricks notebook source +!pip install tqdm + +# COMMAND ---------- + +from pyspark.sql.functions import * +from tqdm import tqdm + +# COMMAND ---------- + +dbutils.widgets.text("database_list", "all") +database_list = dbutils.widgets.get("database_list").split(",") + +dbutils.widgets.text("get_ddl", "false") +getDDL = dbutils.widgets.get("get_ddl") == "true" + +dbutils.widgets.text("calculate_size", "false") +calculateSize = dbutils.widgets.get("calculate_size") == "true" + +# COMMAND ---------- + +def getAllDatabases(): + databaseList = spark.sql(f"""SHOW DATABASES""").select("databaseName").rdd.flatMap(lambda x:x).collect() + return databaseList + +def getAllTables(database): + tableList = spark.sql(f"""SHOW TABLES IN {database}""").select("tableName").rdd.flatMap(lambda x:x).collect() + databaseAndTableList = [f"{database}.{t}" for t in tableList] + return databaseAndTableList + +def getTableDetail(table, detail): + try: + tableDetail = spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == '{detail}'").select("data_type").rdd.flatMap(lambda x:x).collect()[0] + except Exception as e: + tableDetail = "N/A" + return tableDetail + +def getTableSize(table, calculateSize): + if calculateSize: + spark.sql(f"ANALYZE TABLE {table} COMPUTE STATISTICS NOSCAN") + try: + tableSize = (spark.sql(f"DESCRIBE DETAIL {table}").collect()[0]['sizeInBytes']) + if (tableSize == None): + tableSize = int(spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == 'Statistics'").select("data_type").rdd.flatMap(lambda x:x).collect()[0].split(' ')[0]) + except Exception as e: + tableSize = -1 + else: + tableSize = -1 + return tableSize + +def getTableDDL(table, getDDL): + if getDDL: + tableDDL = spark.sql(f"""SHOW CREATE TABLE {table}""").collect()[0][0] + else: + tableDDL = "N/A" + return tableDDL + +# COMMAND ---------- + +def main_scout(database_list): + + if database_list == ['all']: + database_list = getAllDatabases() + + print(f"Analyzing {len(database_list)} databases.") + fullTableList = [] + + for database in database_list: + tableList = getAllTables(database) + print(f"{database}: {len(tableList)}") + fullTableList.extend(tableList) + + print(f"Found {len(fullTableList)} in {len(database_list)} databases.") + + fullTableDetails = [] + failedTables = [] + + for table in tqdm(fullTableList): + try: + tableType = getTableDetail(table, "Type") + tableLocation = getTableDetail(table, "Location") + tableProvider = getTableDetail(table, "Provider") + tableVersion = getTableDetail(table, "Created By") + tableSize = getTableSize(table, calculateSize) + tableDDL = 
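# Example widget values for Metastore_Scout.py above when running it interactively (database names are
# placeholders):
#   database_list  -> "sales_db,finance_db", or "all" to scan every database
#   get_ddl        -> "true" to capture SHOW CREATE TABLE output per table
#   calculate_size -> "true" to run ANALYZE TABLE ... COMPUTE STATISTICS NOSCAN before reading sizeInBytes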
getTableDDL(table, getDDL) + fullTableDetails.append((table, tableType, tableLocation, tableProvider, tableVersion, tableSize, tableDDL)) + except Exception as e: + failedTables.append((table, str(e))) + continue + + columns = ["tableName", "tableType", "tableLocation", "tableProvider", "tableVersion", "tableSize", "tableDDL"] + spark.createDataFrame(data=fullTableDetails, schema = columns).write.mode("overwrite").saveAsTable("e2_migration_testing_to_delete.metastore_scan") + + failedTableSchema = StructType([ + StructField("table", StringType(),True), + StructField("error", StringType(),True) + ]) + + spark.createDataFrame(data = failedTables, schema = failedTableSchema).write.mode("overwrite").saveAsTable("e2_migration_testing_to_delete.metastore_scan_errors") + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC CREATE DATABASE IF NOT EXISTS e2_migration_testing_to_delete + +# COMMAND ---------- + +main_scout(database_list) + +# COMMAND ---------- + + diff --git a/data/notebooks/Metastore_Scout_no_views.py b/data/notebooks/Metastore_Scout_no_views.py new file mode 100644 index 00000000..a8ec31d9 --- /dev/null +++ b/data/notebooks/Metastore_Scout_no_views.py @@ -0,0 +1,97 @@ +# Databricks notebook source +!pip install tqdm + +# COMMAND ---------- + +from pyspark.sql.functions import * +from tqdm import tqdm + +# COMMAND ---------- + +dbutils.widgets.text("database_list", "") +database_list = dbutils.widgets.get("database_list").split(",") + +# COMMAND ---------- + +def getAllDatabases(): + databaseList = spark.sql(f"""SHOW DATABASES""").select("databaseName").rdd.flatMap(lambda x:x).collect() + return databaseList + +def getAllTables(database): + tableList = spark.sql(f"""SHOW TABLES IN {database}""").select("tableName").rdd.flatMap(lambda x:x).collect() + views_list = spark.sql("SHOW VIEWS FROM schema_name").select("viewName").rdd.flatMap(lambda x: x).collect() + tables_only_list = [x for x in tables_list if x not in views_list] + databaseAndTableList = [f"{database}.{t}" for t in tables_only_list] + return databaseAndTableList + +def getTableDetail(table, detail): + try: + tableDetail = spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == '{detail}'").select("data_type").rdd.flatMap(lambda x:x).collect()[0] + except Exception as e: + tableDetail = "N/A" + return tableDetail + +def getTableSize(table): + spark.sql(f"ANALYZE TABLE {table} COMPUTE STATISTICS NOSCAN") + try: + tableSize = (spark.sql(f"DESCRIBE DETAIL {table}").collect()[0]['sizeInBytes']) + if (tableSize == None): + tableSize = int(spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == 'Statistics'").select("data_type").rdd.flatMap(lambda x:x).collect()[0].split(' ')[0]) + except Exception as e: + tableSize = -1 + return tableSize + +def getTableDDL(table): + tableDDL = spark.sql(f"""SHOW CREATE TABLE {table}""").collect()[0][0] + return tableDDL + +# COMMAND ---------- + +def main_scout(): + + if database_list == ['all']: + database_list = getAllDatabases() + + print(f"Analyzing {len(database_list)} databases.") + fullTableList = [] + + for database in database_list: + tableList = getAllTables(database) + print(f"{database}: {len(tableList)}") + fullTableList.extend(tableList) + + print(f"Found {len(fullTableList)} in {len(database_list)} databases.") + + fullTableDetails = [] + failedTables = [] + + for table in tqdm(fullTableList): + try: + tableType = getTableDetail(table, "Type") + tableLocation = getTableDetail(table, "Location") + tableProvider = getTableDetail(table, "Provider") + tableVersion = 
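# The failedTableSchema block in Metastore_Scout.py builds an explicit schema from StructType, StructField
# and StringType, which `from pyspark.sql.functions import *` does not provide. A minimal sketch of the
# import the notebook appears to assume:
from pyspark.sql.types import StructType, StructField, StringType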
getTableDetail(table, "Created By") + tableSize = getTableSize(table) + tableDDL = getTableDDL(table) + fullTableDetails.append((table, tableType, tableLocation, tableProvider, tableVersion, tableSize, tableDDL)) + except Exception as e: + failedTables.append((table, str(e))) + continue + + columns = ["tableName", "tableType", "tableLocation", "tableProvider", "tableVersion", "tableSize", "tableDDL"] + spark.createDataFrame(data=fullTableDetails, schema = columns).write.mode("overwrite").saveAsTable("e2_migration_testing_to_delete.metastore_scan") + + spark.createDataFrame(data = failedTables, schema = ['table', 'error']).write.mode("overwrite").saveAsTable("e2_migration_testing_to_delete.metastore_scan_errors") + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC CREATE DATABASE IF NOT EXISTS e2_migration_testing_to_delete + +# COMMAND ---------- + +main_scout() + +# COMMAND ---------- + + From b9760e06c3de3adac8152e8f79823e763457a649 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 17 Jan 2024 09:16:31 -0500 Subject: [PATCH 078/111] empty dir creator need to add this to workspace client import or export --- data/notebooks/empty_directory_creator.py | 126 ++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 data/notebooks/empty_directory_creator.py diff --git a/data/notebooks/empty_directory_creator.py b/data/notebooks/empty_directory_creator.py new file mode 100644 index 00000000..351a150c --- /dev/null +++ b/data/notebooks/empty_directory_creator.py @@ -0,0 +1,126 @@ +import argparse +from datetime import timedelta +import json +import time +import requests +import pandas as pd + +def _get_workspace_list(STURL, STTOKEN, path="/"): + print(f"Directories under {path}...") + requestsURL = STURL + "/api/2.0/workspace/list?path=" + requestsURL += path + headers = { + 'Authorization': f'Bearer {STTOKEN}' + } + payload = {} + print(requestsURL) + response = requests.request("GET", requestsURL, headers=headers, data=payload) + if response.status_code == 200: + try: + pathsFound = response.json()['objects'] + dirsFound = [obj for obj in pathsFound if obj.get("object_type") == "DIRECTORY"] + print(f"Found: {len(dirsFound)} directories") + return dirsFound, "Not empty" + except KeyError: + print(f"Appears that {path} is empty... 
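# getAllTables in Metastore_Scout_no_views.py above filters views out of the table list, but the version
# shown queries a fixed schema_name and references an undefined tables_list. A corrected sketch, assuming
# SHOW VIEWS IN <database> is available in this Spark version (getTablesWithoutViews is a hypothetical name):
def getTablesWithoutViews(database):
    tableList = spark.sql(f"SHOW TABLES IN {database}").select("tableName").rdd.flatMap(lambda x: x).collect()
    viewList = spark.sql(f"SHOW VIEWS IN {database}").select("viewName").rdd.flatMap(lambda x: x).collect()
    # keep only real tables, fully qualified with their database
    return [f"{database}.{t}" for t in tableList if t not in viewList]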
Logging.") + return [], "Empty" + else: + print(response.text) + return "Failed", "Failed" + +def _make_E2_empty_directory(E2URL, E2TOKEN, path): + print(f"Making an empty directory at {path} in E2...") + requestsURL = E2URL + "/api/2.0/workspace/mkdirs" + headers = { + 'Authorization': f'Bearer {E2TOKEN}' + } + payload = {"path": path} + print(requestsURL, payload) + response = requests.request("POST", requestsURL, headers=headers, data=payload) + + if response.status_code == 200: + print(f"Successfully created empty directory at {path} in E2...") + return "Success" + else: + print(response.text) + return "Failed" + + +def _run_test_if_empty(ST, STTOKEN, E2, E2TOKEN, pathsToCheck, pathsChecked, pathsStatus, pathsCreated, pathsCreatedStatus): + next_level_dirs = [] + + for newPath in pathsToCheck: + newDirs, status = _get_workspace_list(ST, STTOKEN, newPath) + pathsChecked.append(newPath) + pathsStatus.append(status) + next_level_dirs.extend(newDirs) + + if status == "Empty": + result = _make_E2_empty_directory(E2, E2TOKEN, newPath) + pathsCreated.append(newPath) + pathsCreatedStatus.append(result) + + if len(next_level_dirs) == 0: + test_status = "Done" + else: + test_status = "Again" + + return pathsChecked, pathsStatus, pathsCreated, pathsCreatedStatus, next_level_dirs, test_status + +def main(E2, E2TOKEN, ST, STTOKEN, PATH="/"): + print("Starting empty workspace creation...") + start = time.time() + + if PATH is None: + PATH = "/" + + pathsChecked = [] + pathsStatus = [] + pathsCreated = [] + pathsCreatedStatus = [] + + dirs, status = _get_workspace_list(ST, STTOKEN, PATH) + pathsChecked.append(PATH) + pathsStatus.append(status) + + while True: + pathsChecked, pathsStatus, pathsCreated, pathsCreatedStatus, dirs, test_status = _run_test_if_empty(ST, STTOKEN, E2, E2TOKEN, dirs, pathsChecked, pathsStatus, pathsCreated, pathsCreatedStatus) + + if test_status == "Done": + print("Should end now...") + break + + modelDict = { + 'paths': pathsChecked, + 'empty_or_not': pathsStatus, + } + + print("Logging the paths checked...") + df = pd.DataFrame.from_dict(modelDict) + df.to_csv("paths_checked.csv") + print("Saved paths checked to paths_checked.csv") + + modelDict = { + 'paths': pathsCreated, + 'empty_or_not': pathsCreatedStatus, + } + + print("Logging the paths created...") + df = pd.DataFrame.from_dict(modelDict) + df.to_csv("paths_created.csv") + print("Saved paths created to paths_created.csv") + + end = time.time() + print("...Finished") + execution_time = end - start + print(f"Time script took: {timedelta(seconds=execution_time)}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Move sample jobs for an E2 Migration.") + parser.add_argument("--E2workspace", "--E2", dest="E2", help="URL to the E2 workspace") + parser.add_argument("--E2token", dest="E2TOKEN", help="E2 token for access.") + parser.add_argument("--STworkspace", "--ST", dest="ST", help="URL to the ST workspace") + parser.add_argument("--STtoken", dest="STTOKEN", help="ST token for access.") + parser.add_argument("--PATH", dest="PATH", help="Starting path, defaults to '/'. 
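# The mkdirs call in _make_E2_empty_directory above passes a dict through the `data=` argument, which
# requests form-encodes; the Workspace API documents JSON request bodies, so a sketch of the same call with
# an explicit JSON payload:
response = requests.request(
    "POST",
    E2URL + "/api/2.0/workspace/mkdirs",
    headers={'Authorization': f'Bearer {E2TOKEN}'},
    json={"path": path},
)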
Will work recursively from there.") + parser = parser.parse_args() + main(parser.ST, parser.E2, parser.STTOKEN, parser.E2TOKEN, parser.PATH) \ No newline at end of file From 7932516d972f128aea3c16a836712e08a3bc6a85 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 17 Jan 2024 09:43:25 -0500 Subject: [PATCH 079/111] Add files via upload --- data/notebooks/DBFS File Export.py | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 data/notebooks/DBFS File Export.py diff --git a/data/notebooks/DBFS File Export.py b/data/notebooks/DBFS File Export.py new file mode 100644 index 00000000..b5295eca --- /dev/null +++ b/data/notebooks/DBFS File Export.py @@ -0,0 +1,48 @@ +# Databricks notebook source +dbutils.widgets.text("bucket","dbfs:/mnt/....","1: S3 Intermediary Bucket") +dbutils.widgets.text("dbfs","dbfs:/","2: DBFS Directory") + +# COMMAND ---------- + +from py4j.java_gateway import java_import +java_import(sc._gateway.jvm, "") + +bucket_dest_dir = dbutils.widgets.get("bucket") +dbfs_source_dir = dbutils.widgets.get("dbfs") + +print(f"Getting list of files in the source directory {dbfs_source_dir}...") + +# Get list of files in the source directory +skip_paths = ["dbfs:/mnt/", "dbfs:/databricks/", "dbfs:/databricks-datasets/","dbfs:/databricks-results/"] +files = dbutils.fs.ls(dbfs_source_dir) +print(f"Found {len(files)} in source directory.") + +# COMMAND ---------- + +# hadoop_conf = sc._jsc.hadoopConfiguration(): This line is getting the Hadoop configuration from the Java Spark Context. This configuration contains settings for Hadoop and can be used to interact with the Hadoop file system. +# hadoop_fs = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem: This line is accessing the Hadoop FileSystem class via PySpark's JVM gateway. The FileSystem class is a generic class in Hadoop that handles file systems. +# hadoop_path = sc._gateway.jvm.org.apache.hadoop.fs.Path: This line is accessing the Hadoop Path class via PySpark's JVM gateway. The Path class represents file and directory paths in a Hadoop file system. 
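# The closing call in empty_directory_creator.py above passes its arguments positionally, while the
# signature is main(E2, E2TOKEN, ST, STTOKEN, PATH); a sketch of the same wiring with keyword arguments,
# which keeps the ST/E2 URLs and tokens from being swapped:
args = parser.parse_args()
main(E2=args.E2, E2TOKEN=args.E2TOKEN, ST=args.ST, STTOKEN=args.STTOKEN, PATH=args.PATH)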
+ +hadoop_conf = sc._jsc.hadoopConfiguration() +hadoop_fs = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem +hadoop_path = sc._gateway.jvm.org.apache.hadoop.fs.Path + +def copy_file(file): + from_path = hadoop_path(file) + to_path = hadoop_path(bucket_dest_dir) + from_fs = hadoop_fs.get(from_path.toUri(), hadoop_conf) + to_fs = hadoop_fs.get(to_path.toUri(), hadoop_conf) + print(f"Moving {from_path} to {to_path}") + sc._gateway.jvm.org.apache.hadoop.fs.FileUtil.copy(from_fs, from_path, to_fs, to_path, False, hadoop_conf) + + +# Copy each file to the destination directory +for file in files: + file_name = file.path + copy_file(file_name) + +print("All files copied to the bucket successfully!") + +# COMMAND ---------- + + From 55aed2a14ad3934cc0781ca1b68001ff7b7faeca Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 18 Jan 2024 12:01:37 -0500 Subject: [PATCH 080/111] Create DBFS File Import --- data/notebooks/DBFS File Import | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 data/notebooks/DBFS File Import diff --git a/data/notebooks/DBFS File Import b/data/notebooks/DBFS File Import new file mode 100644 index 00000000..84adc059 --- /dev/null +++ b/data/notebooks/DBFS File Import @@ -0,0 +1,44 @@ +# Databricks notebook source +dbutils.widgets.text("bucket","dbfs:/mnt/....","1: S3 Intermediary Bucket") +dbutils.widgets.text("dbfs","dbfs:/","2: DBFS Directory") + +# COMMAND ---------- + +from py4j.java_gateway import java_import +java_import(sc._gateway.jvm, "") + +bucket_source_dir = dbutils.widgets.get("bucket") +dbfs_dest_dir = dbutils.widgets.get("dbfs") + +print(f"Getting list of files in the source directory {bucket_source_dir}...") + +# Get list of files in the source directory +files = dbutils.fs.ls(bucket_source_dir) +print(f"Found {len(files)} in source directory.") + +# COMMAND ---------- + +# hadoop_conf = sc._jsc.hadoopConfiguration(): This line is getting the Hadoop configuration from the Java Spark Context. This configuration contains settings for Hadoop and can be used to interact with the Hadoop file system. +# hadoop_fs = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem: This line is accessing the Hadoop FileSystem class via PySpark's JVM gateway. The FileSystem class is a generic class in Hadoop that handles file systems. +# hadoop_path = sc._gateway.jvm.org.apache.hadoop.fs.Path: This line is accessing the Hadoop Path class via PySpark's JVM gateway. The Path class represents file and directory paths in a Hadoop file system. 
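# skip_paths in DBFS File Export.py above is defined but never applied to the listing; a sketch of one way
# to use it, assuming the intent is to leave mounts and Databricks system folders out of the copy:
files = [f for f in dbutils.fs.ls(dbfs_source_dir)
         if not any(f.path.startswith(p) for p in skip_paths)]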
+ +hadoop_conf = sc._jsc.hadoopConfiguration() +hadoop_fs = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem +hadoop_path = sc._gateway.jvm.org.apache.hadoop.fs.Path + +def copy_file(file): + from_path = hadoop_path(file) + to_path = hadoop_path(dbfs_dest_dir) + from_fs = hadoop_fs.get(from_path.toUri(), hadoop_conf) + to_fs = hadoop_fs.get(to_path.toUri(), hadoop_conf) + print(f"Moving {from_path} to {to_path}") + sc._gateway.jvm.org.apache.hadoop.fs.FileUtil.copy(from_fs, from_path, to_fs, to_path, False, hadoop_conf) + + +# Copy each file to the destination directory +for file in files: + file_name = file.path + copy_file(file_name) + +print("All files copied to the bucket successfully!") +# COMMAND ---------- From 434899a0cda0808422a8221aaabf3441af8ef21f Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Mon, 22 Jan 2024 13:40:48 -0500 Subject: [PATCH 081/111] Update ClustersClient.py resolving bug that was not attaching instance profile to cluster when instance pools were attached --- dbclient/ClustersClient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 203a6cce..63959faf 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -99,7 +99,7 @@ def cleanup_cluster_pool_configs(self, cluster_json, cluster_creator, is_job_clu if 'aws_attributes' in cluster_json: aws_conf = cluster_json.pop('aws_attributes') iam_role = aws_conf.get('instance_profile_arn', None) - if not iam_role: + if iam_role: cluster_json['aws_attributes'] = {'instance_profile_arn': iam_role} return cluster_json From d0a12d3c7885cc4cebea38093f94bd6ab574750d Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 30 Jan 2024 11:25:02 -0500 Subject: [PATCH 082/111] Update ClustersClient.py Added fix to remove missing users + reapply ACLs --- dbclient/ClustersClient.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 63959faf..64cd2c9a 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -354,18 +354,43 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus raise ValueError(error_message) api = f'/preview/permissions/clusters/{cid}' resp = self.put(api, acl_args) + # remove user/group from ACL list if they don't exist if self.skip_missing_users: - ignore_error_list = ["RESOURCE_DOES_NOT_EXIST", "RESOURCE_ALREADY_EXISTS"] + ignore_error_list = ["RESOURCE_DOES_NOT_EXIST", "RESOURCE_ALREADY_EXISTS"] else: ignore_error_list = ["RESOURCE_ALREADY_EXISTS"] if logging_utils.check_error(resp, ignore_error_list): - logging_utils.log_response_error(error_logger, resp) + if resp['error_code'] == 'RESOURCE_DOES_NOT_EXIST': + resp = self.remove_missing_users(api, acl_args, resp) + if not logging_utils.log_response_error(error_logger, resp): + if 'object_id' in data: + checkpoint_cluster_configs_set.write(data['object_id']) + else: + logging_utils.log_response_error(error_logger, resp) elif 'object_id' in data: checkpoint_cluster_configs_set.write(data['object_id']) - print(resp) + def remove_missing_users(self, api, acl_args, resp): + # example message: 'Principal: UserName(x.x@email.com) does not exist' + # or 'Principal: GroupName(x.x) does not exist' + resp = self.put(api, acl_args) + while resp.get('error_code', '') == 'RESOURCE_DOES_NOT_EXIST': + if 'UserName' in resp['message']: + 
missing_user = re.search(r'Principal: UserName\((.*)\) does not exist', resp['message']).group(1) + logging.info(f"Removing missing user {missing_user} from ACL") + acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl['user_name'] != missing_user] + resp = self.put(api, acl_args) + elif 'GroupName' in resp['message']: + missing_group = re.search(r'Principal: GroupName\((.*)\) does not exist', resp['message']).group(1) + logging.info(f"Removing missing group {missing_group} from ACL") + acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl['group_name'] != missing_group] + resp = self.put(api, acl_args) + + return resp + + def _log_cluster_ids_and_original_creators( self, @@ -795,4 +820,3 @@ def wait_for_cluster(self, cid): if c_state['state'] == 'TERMINATED': raise RuntimeError("Cluster is terminated. Please check EVENT history for details") return cid - From 6def6130bc7e5c923804537791d5c08be7d32d88 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 30 Jan 2024 14:25:38 -0500 Subject: [PATCH 083/111] Update ClustersClient.py --- dbclient/ClustersClient.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 64cd2c9a..3a5af1c4 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -380,12 +380,12 @@ def remove_missing_users(self, api, acl_args, resp): if 'UserName' in resp['message']: missing_user = re.search(r'Principal: UserName\((.*)\) does not exist', resp['message']).group(1) logging.info(f"Removing missing user {missing_user} from ACL") - acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl['user_name'] != missing_user] + acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl.get('user_name', None) != missing_user] resp = self.put(api, acl_args) elif 'GroupName' in resp['message']: missing_group = re.search(r'Principal: GroupName\((.*)\) does not exist', resp['message']).group(1) logging.info(f"Removing missing group {missing_group} from ACL") - acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl['group_name'] != missing_group] + acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl.get('group_name', None) != missing_group] resp = self.put(api, acl_args) return resp From a41230c5be72deb1e65a4c436cdca5a0927301f0 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Thu, 1 Feb 2024 13:08:28 -0500 Subject: [PATCH 084/111] Update ClustersClient.py corrected statement to assume default value of config 'groups_to_keep' to be False --- dbclient/ClustersClient.py | 61 +++++--------------------------------- 1 file changed, 8 insertions(+), 53 deletions(-) diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 3a5af1c4..922a2706 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -1,6 +1,5 @@ import logging import os -import csv import re import time import logging_utils @@ -99,7 +98,7 @@ def cleanup_cluster_pool_configs(self, cluster_json, cluster_creator, is_job_clu if 'aws_attributes' in cluster_json: aws_conf = cluster_json.pop('aws_attributes') iam_role = aws_conf.get('instance_profile_arn', None) - if iam_role: + if not iam_role: cluster_json['aws_attributes'] = {'instance_profile_arn': iam_role} return cluster_json @@ -257,20 +256,8 @@ def 
get_new_policy_id_dict(self, policy_file='cluster_policies.log'): old_policy_id = policy_conf['policy_id'] policy_id_dict[old_policy_id] = current_policies_dict[policy_name] # old_id : new_id return policy_id_dict - - def nitro_instance_mapping(self, instance_type_id): - dict_from_csv = {} - real_path = os.path.dirname(os.path.realpath(__file__)) - csv_file = f'{real_path}/../data/nitro_mapping.csv' - with open(csv_file, newline='', mode='r') as f: - reader = csv.DictReader(f) - for row in reader: - dict_from_csv[row['PVC Instance Type']] = row['Recommended Nitro Instance Type'] - - nitro_instance_type_id = dict_from_csv[instance_type_id] - return nitro_instance_type_id - def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None, nitro=False): + def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None): """ Import cluster configs and update appropriate properties / tags in the new env :param log_file: @@ -314,12 +301,6 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus else: cluster_conf['custom_tags'] = {'OriginalCreator': cluster_creator} new_cluster_conf = cluster_conf - if nitro: - if 'node_type_id' in new_cluster_conf: - new_cluster_conf['node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['node_type_id']) - if 'driver_node_type_id' in new_cluster_conf: - new_cluster_conf['driver_node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['driver_node_type_id']) - print("Creating cluster: {0}".format(new_cluster_conf['cluster_name'])) cluster_resp = self.post('/clusters/create', new_cluster_conf) if cluster_resp['http_status_code'] == 200: @@ -329,8 +310,6 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus if 'cluster_id' in cluster_conf: checkpoint_cluster_configs_set.write(cluster_conf['cluster_id']) else: - cluster_resp['old_cluster_id'] = cluster_conf['cluster_id'] - cluster_resp['old_cluster_name'] = cluster_conf['cluster_name'] logging_utils.log_response_error(error_logger, cluster_resp) print(cluster_resp) @@ -354,43 +333,18 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus raise ValueError(error_message) api = f'/preview/permissions/clusters/{cid}' resp = self.put(api, acl_args) - # remove user/group from ACL list if they don't exist if self.skip_missing_users: - ignore_error_list = ["RESOURCE_DOES_NOT_EXIST", "RESOURCE_ALREADY_EXISTS"] + ignore_error_list = ["RESOURCE_DOES_NOT_EXIST", "RESOURCE_ALREADY_EXISTS"] else: ignore_error_list = ["RESOURCE_ALREADY_EXISTS"] if logging_utils.check_error(resp, ignore_error_list): - if resp['error_code'] == 'RESOURCE_DOES_NOT_EXIST': - resp = self.remove_missing_users(api, acl_args, resp) - if not logging_utils.log_response_error(error_logger, resp): - if 'object_id' in data: - checkpoint_cluster_configs_set.write(data['object_id']) - else: - logging_utils.log_response_error(error_logger, resp) + logging_utils.log_response_error(error_logger, resp) elif 'object_id' in data: checkpoint_cluster_configs_set.write(data['object_id']) - def remove_missing_users(self, api, acl_args, resp): - # example message: 'Principal: UserName(x.x@email.com) does not exist' - # or 'Principal: GroupName(x.x) does not exist' - resp = self.put(api, acl_args) - while resp.get('error_code', '') == 'RESOURCE_DOES_NOT_EXIST': - if 'UserName' in resp['message']: - missing_user = re.search(r'Principal: UserName\((.*)\) does not exist', 
resp['message']).group(1) - logging.info(f"Removing missing user {missing_user} from ACL") - acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl.get('user_name', None) != missing_user] - resp = self.put(api, acl_args) - elif 'GroupName' in resp['message']: - missing_group = re.search(r'Principal: GroupName\((.*)\) does not exist', resp['message']).group(1) - logging.info(f"Removing missing group {missing_group} from ACL") - acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl.get('group_name', None) != missing_group] - resp = self.put(api, acl_args) - - return resp - - + print(resp) def _log_cluster_ids_and_original_creators( self, @@ -616,7 +570,7 @@ def log_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_cluster # get users list based on groups_to_keep users_list = [] - if self.groups_to_keep is not None: + if self.groups_to_keep is not False: all_users = self.get('/preview/scim/v2/Users').get('Resources', None) users_list = list(set([user.get("emails")[0].get("value") for user in all_users for group in user.get("groups") if group.get("display") in self.groups_to_keep])) @@ -692,7 +646,7 @@ def log_cluster_policies(self, log_file='cluster_policies.log', acl_log_file='ac # get users list based on groups_to_keep users_list = [] - if self.groups_to_keep is not None: + if self.groups_to_keep is not False: all_users = self.get('/preview/scim/v2/Users').get('Resources', None) users_list = list(set([user.get("emails")[0].get("value") for user in all_users for group in user.get("groups") if @@ -820,3 +774,4 @@ def wait_for_cluster(self, cid): if c_state['state'] == 'TERMINATED': raise RuntimeError("Cluster is terminated. Please check EVENT history for details") return cid + From 922e5c717e139045e65d18b26da5b0839ac97f02 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:14:38 -0500 Subject: [PATCH 085/111] Update ClustersClient.py commented out SCIM mentions in ClustersClient --- dbclient/ClustersClient.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 922a2706..97099e53 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -570,10 +570,10 @@ def log_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_cluster # get users list based on groups_to_keep users_list = [] - if self.groups_to_keep is not False: - all_users = self.get('/preview/scim/v2/Users').get('Resources', None) - users_list = list(set([user.get("emails")[0].get("value") for user in all_users - for group in user.get("groups") if group.get("display") in self.groups_to_keep])) + #if self.groups_to_keep is not False: + # all_users = self.get('/preview/scim/v2/Users').get('Resources', None) + # users_list = list(set([user.get("emails")[0].get("value") for user in all_users + # for group in user.get("groups") if group.get("display") in self.groups_to_keep])) cluster_log = self.get_export_dir() + log_file acl_cluster_log = self.get_export_dir() + acl_log_file @@ -646,11 +646,11 @@ def log_cluster_policies(self, log_file='cluster_policies.log', acl_log_file='ac # get users list based on groups_to_keep users_list = [] - if self.groups_to_keep is not False: - all_users = self.get('/preview/scim/v2/Users').get('Resources', None) - users_list = list(set([user.get("emails")[0].get("value") for user in all_users - for group in user.get("groups") if - 
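# Across the two ClustersClient.py patches above, the groups_to_keep guard changes from `is not None` to
# `is not False` and the SCIM lookup is then commented out. If the lookup is reinstated, a sketch of a plain
# truthiness check that covers a default of either None or False (and an empty list) in one condition:
if self.groups_to_keep:
    all_users = self.get('/preview/scim/v2/Users').get('Resources', [])
    users_list = list(set(user.get("emails")[0].get("value")
                          for user in all_users
                          for group in user.get("groups", [])
                          if group.get("display") in self.groups_to_keep))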
group.get("display") in self.groups_to_keep])) + #if self.groups_to_keep is not False: + # all_users = self.get('/preview/scim/v2/Users').get('Resources', None) + # users_list = list(set([user.get("emails")[0].get("value") for user in all_users + # for group in user.get("groups") if + # group.get("display") in self.groups_to_keep])) # log cluster policy ACLs, which takes a policy id as arguments with open(acl_policies_log, 'w', encoding="utf-8") as acl_fp: From 452223ee1c6a0c55dd20848428d5285e6c711af4 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Tue, 6 Feb 2024 13:55:48 -0500 Subject: [PATCH 086/111] rename emails for specific edge case edge case: fetch users in e2 + create pattern from users --- data/notebooks/rename_emails.py | 184 +++++++++++++++++++++++++------- 1 file changed, 148 insertions(+), 36 deletions(-) diff --git a/data/notebooks/rename_emails.py b/data/notebooks/rename_emails.py index fbc49ca6..0f1b9ed0 100644 --- a/data/notebooks/rename_emails.py +++ b/data/notebooks/rename_emails.py @@ -2,6 +2,137 @@ import os import shutil import csv +import json +import requests + +class dbclient: + """ + Rest API Wrapper for Databricks APIs + """ + # set of http error codes to throw an exception if hit. Handles client and auth errors + http_error_codes = (401, 403) + + def __init__(self, token, url): + self._token = {'Authorization': 'Bearer {0}'.format(token)} + self._url = url.rstrip("/") + self._is_verbose = False + self._verify_ssl = False + if self._verify_ssl: + # set these env variables if skip SSL verification is enabled + os.environ['REQUESTS_CA_BUNDLE'] = "" + os.environ['CURL_CA_BUNDLE'] = "" + + def is_aws(self): + return self._is_aws + + def is_verbose(self): + return self._is_verbose + + def is_skip_failed(self): + return self._skip_failed + + def test_connection(self): + # verify the proper url settings to configure this client + if self._url[-4:] != '.com' and self._url[-4:] != '.net': + print("Hostname should end in '.com'") + return -1 + results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, + verify=self._verify_ssl) + http_status_code = results.status_code + if http_status_code != 200: + print("Error. Either the credentials have expired or the credentials don't have proper permissions.") + print("If you have a ~/.netrc file, check those credentials. 
Those take precedence over passed input.") + print(results.text) + return -1 + return 0 + + def get(self, endpoint, json_params=None, version='2.0', print_json=False): + if version: + ver = version + full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint + if self.is_verbose(): + print("Get: {0}".format(full_endpoint)) + if json_params: + raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) + results = raw_results.json() + else: + raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) + results = raw_results.json() + if print_json: + print(json.dumps(results, indent=4, sort_keys=True)) + if type(results) == list: + results = {'elements': results} + results['http_status_code'] = raw_results.status_code + return results + + def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): + if version: + ver = version + full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint + if self.is_verbose(): + print("{0}: {1}".format(http_type, full_endpoint)) + if json_params: + if http_type == 'post': + if files_json: + raw_results = requests.post(full_endpoint, headers=self._token, + data=json_params, files=files_json, verify=self._verify_ssl) + else: + raw_results = requests.post(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + if http_type == 'put': + raw_results = requests.put(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + if http_type == 'patch': + raw_results = requests.patch(full_endpoint, headers=self._token, + json=json_params, verify=self._verify_ssl) + + http_status_code = raw_results.status_code + if http_status_code in dbclient.http_error_codes: + raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, + http_status_code, + raw_results.text)) + results = raw_results.json() + else: + print("Must have a payload in json_args param.") + return {} + if print_json: + print(json.dumps(results, indent=4, sort_keys=True)) + # if results are empty, let's return the return status + if results: + results['http_status_code'] = raw_results.status_code + return results + else: + return {'http_status_code': raw_results.status_code} + + def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): + return self.http_req('post', endpoint, json_params, version, print_json, files_json) + + def put(self, endpoint, json_params, version='2.0', print_json=False): + return self.http_req('put', endpoint, json_params, version, print_json) + + def patch(self, endpoint, json_params, version='2.0', print_json=False): + return self.http_req('patch', endpoint, json_params, version, print_json) + + +def get_user_id_mapping(client): + # return a dict of the userName to id mapping of the new env + user_list = client.get('/preview/scim/v2/Users').get('Resources', None) + if user_list: + user_mapping = {} + for user in user_list: + user_name = user['userName'] + # user_name: ex. 
ABC123@ccwdata.org + # we need to remove the domain name + user_mapping[user_name.split('@')[0]] = user_name + return user_mapping + return None + def pretty_print_dict(dict_): """ @@ -16,26 +147,21 @@ def pretty_print_dict(dict_): for key, value in dict_.items(): print(f"{key}: {value}") -def to_dict(csv_file, email_column='newEmail'): +def write_json(file_name, data_write): """ - summary: converts a csv or text file (or another comma delim file) into a - dictionary object + summary: writes parameter data_write to the path indicated by parameter + file_name PARAMETERS: - csv_file: path file of the comma delim file, assumes that there are no column - headings, each user address is split by a new line, and the old and new - address are split by a comma in that order. + file_name: path of the file that is to be written + data_write: text object RETURNS: - dict_from_csv: dictionary object where key is the old item and value - is new item + n/a """ - dict_from_csv = {} - with open(csv_file, newline='', mode='r') as f: - reader = csv.DictReader(f) - for row in reader: - dict_from_csv[row['userName']] = row[email_column] - return dict_from_csv + with open(file_name, "w") as f: + f.write(data_write) + def map(file_name, mapping): """ @@ -70,21 +196,6 @@ def map(file_name, mapping): return data -def write(file_name, data_write): - """ - summary: writes parameter data_write to the path indicated by parameter - file_name - - PARAMETERS: - file_name: path of the file that is to be written - data_write: text object - - RETURNS: - n/a - """ - with open(file_name, "w") as f: - f.write(data_write) - def rename_users_folder(mapping): """ summary: renames the user folder by moving all files to new directory @@ -126,20 +237,20 @@ def mapping_file(file_name, mapping): # this code here (directly referencing the number 4) assumes that the file name # has the 3 letter extension (e.g. something.txt or something.csv data = map(file_name, mapping) - write(file_name, data) + write_json(file_name, data) + def main(): all_args = argparse.ArgumentParser() - all_args.add_argument("--dir", "--file", dest="file", required=True, help='directory needs to be updated via mapping.') - all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') - all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') + all_args.add_argument("--file", dest="file", required=True, help='files to map. e.g. 
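# get_user_id_mapping above reads a single /preview/scim/v2/Users response; on workspaces with many users
# the SCIM endpoint can page its results, so a paginated sketch, assuming the standard SCIM startIndex and
# count query parameters are honoured (get_all_scim_users is a hypothetical helper):
def get_all_scim_users(client, page_size=100):
    users, start = [], 1
    while True:
        resp = client.get('/preview/scim/v2/Users', {'startIndex': start, 'count': page_size})
        batch = resp.get('Resources', [])
        users.extend(batch)
        if len(batch) < page_size:
            return users
        start += page_size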
logs/session') + all_args.add_argument("--host", dest="host", required=True, help='host of the new databricks env') + all_args.add_argument("--token", dest="token", required=True, help='token of the new databricks env') args = all_args.parse_args() file_name = args.file - mapping_file_ = args.mapping - email_column = args.column - mapping = to_dict(mapping_file_, email_column) + client = dbclient(args.token, args.host) + mapping = get_user_id_mapping(client) #mapping = {"admin": "ADMIN_NEW@GMAIL.COM", "service_principal": "service_principal_id"} print("--------------------") pretty_print_dict(mapping) @@ -148,6 +259,7 @@ def main(): if yesno.lower() != "y": exit() + # change the current working director to specified path #os.chdir("logs/session") os.chdir(file_name) From d2bdd48ecad8b6d7847faf8199b0cafcb0c78a6e Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:42:13 -0500 Subject: [PATCH 087/111] Delete data/notebooks/Clusters_Scout.py --- data/notebooks/Clusters_Scout.py | 197 ------------------------------- 1 file changed, 197 deletions(-) delete mode 100644 data/notebooks/Clusters_Scout.py diff --git a/data/notebooks/Clusters_Scout.py b/data/notebooks/Clusters_Scout.py deleted file mode 100644 index 6766a601..00000000 --- a/data/notebooks/Clusters_Scout.py +++ /dev/null @@ -1,197 +0,0 @@ -# Databricks notebook source -# Databricks notebook source -import json, os, datetime, requests -import requests.packages.urllib3 - -global pprint_j - -requests.packages.urllib3.disable_warnings() - - -# Helper to pretty print json -def pprint_j(i): - print(json.dumps(i, indent=4, sort_keys=True)) - - -class dbclient: - """ - Rest API Wrapper for Databricks APIs - """ - # set of http error codes to throw an exception if hit. Handles client and auth errors - http_error_codes = (401, 403) - - def __init__(self, token, url): - self._token = {'Authorization': 'Bearer {0}'.format(token)} - self._url = url.rstrip("/") - self._is_verbose = False - self._verify_ssl = False - if self._verify_ssl: - # set these env variables if skip SSL verification is enabled - os.environ['REQUESTS_CA_BUNDLE'] = "" - os.environ['CURL_CA_BUNDLE'] = "" - - def is_aws(self): - return self._is_aws - - def is_verbose(self): - return self._is_verbose - - def is_skip_failed(self): - return self._skip_failed - - def test_connection(self): - # verify the proper url settings to configure this client - if self._url[-4:] != '.com' and self._url[-4:] != '.net': - print("Hostname should end in '.com'") - return -1 - results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, - verify=self._verify_ssl) - http_status_code = results.status_code - if http_status_code != 200: - print("Error. Either the credentials have expired or the credentials don't have proper permissions.") - print("If you have a ~/.netrc file, check those credentials. 
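# A usage sketch for rename_emails.py above, with placeholder host and token values:
#
#     python rename_emails.py --file logs/session --host https://<new-workspace>.cloud.databricks.com --token <PAT>
#
# The script builds the local-part -> userName mapping from the new workspace, prints it for confirmation,
# and then applies the mapping to the exported logs (see the map, rename_users_folder and mapping_file
# helpers above).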
Those take precedence over passed input.") - print(results.text) - return -1 - return 0 - - def get(self, endpoint, json_params=None, version='2.0', print_json=False): - if version: - ver = version - full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint - if self.is_verbose(): - print("Get: {0}".format(full_endpoint)) - if json_params: - raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) - results = raw_results.json() - else: - raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) - results = raw_results.json() - if print_json: - print(json.dumps(results, indent=4, sort_keys=True)) - if type(results) == list: - results = {'elements': results} - results['http_status_code'] = raw_results.status_code - return results - - def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): - if version: - ver = version - full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint - if self.is_verbose(): - print("{0}: {1}".format(http_type, full_endpoint)) - if json_params: - if http_type == 'post': - if files_json: - raw_results = requests.post(full_endpoint, headers=self._token, - data=json_params, files=files_json, verify=self._verify_ssl) - else: - raw_results = requests.post(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - if http_type == 'put': - raw_results = requests.put(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - if http_type == 'patch': - raw_results = requests.patch(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, - http_status_code, - raw_results.text)) - results = raw_results.json() - else: - print("Must have a payload in json_args param.") - return {} - if print_json: - print(json.dumps(results, indent=4, sort_keys=True)) - # if results are empty, let's return the return status - if results: - results['http_status_code'] = raw_results.status_code - return results - else: - return {'http_status_code': raw_results.status_code} - - def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): - return self.http_req('post', endpoint, json_params, version, print_json, files_json) - - def put(self, endpoint, json_params, version='2.0', print_json=False): - return self.http_req('put', endpoint, json_params, version, print_json) - - def patch(self, endpoint, json_params, version='2.0', print_json=False): - return self.http_req('patch', endpoint, json_params, version, print_json) - - @staticmethod - def my_map(F, items): - to_return = [] - for elem in items: - to_return.append(F(elem)) - return to_return - - def set_export_dir(self, dir_location): - self._export_dir = dir_location - - def get_export_dir(self): - return self._export_dir - - def get_latest_spark_version(self): - versions = self.get('/clusters/spark-versions')['versions'] - v_sorted = 
sorted(versions, key=lambda i: i['key'], reverse=True) - for x in v_sorted: - img_type = x['key'].split('-')[1][0:5] - if img_type == 'scala': - return x - - -# COMMAND ---------- - -class discoveryClient(dbclient): - def get_clusters(self): - clusters_list = self.get('/clusters/list').get('clusters', []) - return clusters_list - - def get_num_defined_clusters(self): - clusters_list = self.clusters_list() - return len(clusters_list) - -# COMMAND ---------- - -url = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) -token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) - -client = discoveryClient(token, url) - -# COMMAND ---------- - -clusters = client.get_clusters() -clusters[0] - - -# COMMAND ---------- - -cluster_details = [] -for cluster in clusters: - cluster_id = cluster['cluster_id'] - cluster_name = cluster['cluster_name'] - creator = cluster['creator_user_name'] - node_type = cluster['node_type_id'] - driver_type = cluster['driver_node_type_id'] - custom_tags = cluster['custom_tags'] - spark_version = cluster['spark_version'] - instance_profile = cluster['aws_attributes'].get('instance_profile_arn', 'No Instance Profile') - cluster_details.append((cluster_id, cluster_name, creator, node_type, driver_type, custom_tags, spark_version, instance_profile)) - -# COMMAND ---------- - -columns = ['cluster_id', 'name', 'creator', 'node_type', 'driver_type', 'custom_tags', 'spark_version', 'instance_profile'] -spark.createDataFrame(data=cluster_details, schema = columns).write.mode("overwrite").saveAsTable("uc_discovery.clusters") - -# COMMAND ---------- - - From 919610fa85a716348952f71d8dc9e676ad7a6a52 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:43:36 -0500 Subject: [PATCH 088/111] Delete data/notebooks/create_sample_jobs.py --- data/notebooks/create_sample_jobs.py | 95 ---------------------------- 1 file changed, 95 deletions(-) delete mode 100644 data/notebooks/create_sample_jobs.py diff --git a/data/notebooks/create_sample_jobs.py b/data/notebooks/create_sample_jobs.py deleted file mode 100644 index 827ed1f3..00000000 --- a/data/notebooks/create_sample_jobs.py +++ /dev/null @@ -1,95 +0,0 @@ -import json -import pandas as pd -import csv -import os -import datetime -import argparse - -def read_log(file_name): - try: - with open("./all_jobs/"+file_name) as f: - data = f.read().split("\n") - return data[:-1] - except FileNotFoundError as e: - return '' - except Exception as e: - print("Error while reading file:", file_name, "\n", e) - return '' - -def move_logs(timestamp=""): - # moving all_jobs - os.rename("jobs.log", f"./all_jobs/jobs{timestamp}.log") - os.rename("acl_jobs.log", f"./all_jobs/acl_jobs{timestamp}.log") - -def write_job_log(data, sample_job_ids): - with open("jobs.log", "w") as jl: - for d in data: - try: - d = json.loads(d) - if d['job_id'] in sample_job_ids: - jl.write(json.dumps(d) + "\n") - except: - print("Error while writing jobs.log") - - -def write_job_acls_log(data, sample_job_ids): - with open("acl_jobs.log", "w") as jal: - for d in data: - try: - d = json.loads(d) - if int(d['object_id'].split("/")[-1]) in sample_job_ids: - jal.write(json.dumps(d) + "\n") - except: - print("Error while writing acl_jobs.log") - -def write_rest_job_logs(jobslog, acljobslog, sample_job_ids): - with open("other_jobs.log", "w") as ojl: - for d in jobslog: - try: - d = json.loads(d) - if d['job_id'] not in sample_job_ids: - 
ojl.write(json.dumps(d) + "\n") - except: - print("Error while writing other_jobs.log") - - with open("other_acl_jobs.log", "w") as ojal: - for d in acljobslog: - try: - d = json.loads(d) - if int(d['object_id'].split("/")[-1]) not in sample_job_ids: - ojal.write(json.dumps(d) + "\n") - except: - print("Error while writing other_acl_jobs.log") - -def main(): - # arguments - parser = argparse.ArgumentParser(description='Create sample jobs log') - parser.add_argument('--jobs', nargs='+', type=int, help='list of job ids to sample') - parser.add_argument('--logs', type=str, help='path to logs folder', default="logs/session/") # path to logs folder - - args = parser.parse_args() - job_ids = args.jobs - logs_path = args.logs - - os.chdir(logs_path) - - if "all_jobs" not in os.listdir(): - os.mkdir("./all_jobs/") - move_logs() - elif "jobs.log" in os.listdir(): - ts = datetime.datetime.now() - move_logs("_"+str(ts)) - - #json objects - job_log_data = read_log("jobs.log") - job_acl_log_data = read_log("acl_jobs.log") - - #move jobs.log into ./alljobs folder + write sample jobs log in main logs folder - write_job_log(job_log_data, job_ids) - write_job_acls_log(job_acl_log_data, job_ids) - - #write jobs.log that only contains jobs NOT in sample jobs log - write_rest_job_logs(job_log_data, job_acl_log_data, job_ids) - -if __name__ == "__main__": - main() From dd5cd460e6806370802fa5a4e00b1068ef538ba5 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:44:22 -0500 Subject: [PATCH 089/111] Delete data/notebooks/patch_clusters.py --- data/notebooks/patch_clusters.py | 239 ------------------------------- 1 file changed, 239 deletions(-) delete mode 100644 data/notebooks/patch_clusters.py diff --git a/data/notebooks/patch_clusters.py b/data/notebooks/patch_clusters.py deleted file mode 100644 index 1eae442a..00000000 --- a/data/notebooks/patch_clusters.py +++ /dev/null @@ -1,239 +0,0 @@ -import json -from datetime import datetime -import os -import requests -import argparse - -import warnings -warnings.filterwarnings("ignore") - -create_configs = {'num_workers', - 'autoscale', - 'cluster_name', - 'spark_version', - 'spark_conf', - 'aws_attributes', - 'node_type_id', - 'driver_node_type_id', - 'ssh_public_keys', - 'custom_tags', - 'cluster_log_conf', - 'init_scripts', - 'docker_image', - 'spark_env_vars', - 'autotermination_minutes', - 'enable_elastic_disk', - 'instance_pool_id', - 'driver_instance_pool_id', - 'policy_id', - 'pinned_by_user_name', - 'creator_user_name', - 'cluster_id', - 'data_security_mode'} - -class dbclient: - """ - Rest API Wrapper for Databricks APIs - """ - # set of http error codes to throw an exception if hit. 
Handles client and auth errors - http_error_codes = (401, 403) - - def __init__(self, token, url): - self._token = {'Authorization': 'Bearer {0}'.format(token)} - self._url = url.rstrip("/") - self._is_verbose = False - self._verify_ssl = False - if self._verify_ssl: - # set these env variables if skip SSL verification is enabled - os.environ['REQUESTS_CA_BUNDLE'] = "" - os.environ['CURL_CA_BUNDLE'] = "" - - def is_aws(self): - return self._is_aws - - def is_verbose(self): - return self._is_verbose - - def is_skip_failed(self): - return self._skip_failed - - def test_connection(self): - # verify the proper url settings to configure this client - if self._url[-4:] != '.com' and self._url[-4:] != '.net': - print("Hostname should end in '.com'") - return -1 - results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, - verify=self._verify_ssl) - http_status_code = results.status_code - if http_status_code != 200: - print("Error. Either the credentials have expired or the credentials don't have proper permissions.") - print("If you have a ~/.netrc file, check those credentials. Those take precedence over passed input.") - print(results.text) - return -1 - return 0 - - def get(self, endpoint, json_params=None, version='2.0', print_json=False): - if version: - ver = version - full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint - if self.is_verbose(): - print("Get: {0}".format(full_endpoint)) - if json_params: - raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) - results = raw_results.json() - else: - raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) - results = raw_results.json() - if print_json: - print(json.dumps(results, indent=4, sort_keys=True)) - if type(results) == list: - results = {'elements': results} - results['http_status_code'] = raw_results.status_code - return results - - def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): - if version: - ver = version - full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint - if self.is_verbose(): - print("{0}: {1}".format(http_type, full_endpoint)) - if json_params: - if http_type == 'post': - if files_json: - raw_results = requests.post(full_endpoint, headers=self._token, - data=json_params, files=files_json, verify=self._verify_ssl) - else: - raw_results = requests.post(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - if http_type == 'put': - raw_results = requests.put(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - if http_type == 'patch': - raw_results = requests.patch(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, - http_status_code, - raw_results.text)) - results = raw_results.json() - else: - print("Must have a payload in json_args param.") - return {} - if 
print_json: - print(json.dumps(results, indent=4, sort_keys=True)) - # if results are empty, let's return the return status - if results: - results['http_status_code'] = raw_results.status_code - return results - else: - return {'http_status_code': raw_results.status_code} - - def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): - return self.http_req('post', endpoint, json_params, version, print_json, files_json) - - def put(self, endpoint, json_params, version='2.0', print_json=False): - return self.http_req('put', endpoint, json_params, version, print_json) - - def patch(self, endpoint, json_params, version='2.0', print_json=False): - return self.http_req('patch', endpoint, json_params, version, print_json) - -def read_log(file_name): - """ - summary: reads a given log - """ - try: - with open(file_name) as f: - data = f.read().split("\n") - return data - except FileNotFoundError as e: - return print(f"{datetime.now()} Error: {file_name} not found. ") - except Exception as e: - print(f"{datetime.now()} Error: There was an unknown error reading {file_name}. ") - print(e) - return '' - -def get_clusters_list(client): - # databricks clusters list - endpoint = "/clusters/list" - clusters_list = client.get(endpoint).get('clusters', []) - return clusters_list - -def get_clusters_ips(log_name): - data = read_log(log_name) - instance_profiles = {} - for d in data: - if len(d) != 0: - d = d.strip() - d = json.loads(d) - c_name = d.get('cluster_name', 0) - ip = d.get('aws_attributes', {}).get('instance_profile_arn', 0) - if ip != 0: - instance_profiles[c_name] = ip - return instance_profiles - -def update_cluster_ips(client, instance_profiles): - cnames = get_clusters_list(client) - for c in cnames: - print(c) - if c.get('cluster_name', -1293) in instance_profiles.keys(): - c_name = c.get('cluster_name', 0) - c_id = c.get('cluster_id', 0) - current_cluster_json = client.get(f'/clusters/get?cluster_id={c_id}') - print(current_cluster_json) - run_properties = set(list(current_cluster_json.keys())) - create_configs - for p in run_properties: - del current_cluster_json[p] - if 'aws_attributes' in current_cluster_json: - aws_conf = current_cluster_json.pop('aws_attributes') - iam_role = instance_profiles[c_name] - aws_conf['instance_profile_arn'] = iam_role - else: - aws_conf = {} - iam_role = instance_profiles[c_name] - aws_conf['instance_profile_arn'] = iam_role - - current_cluster_json['aws_attributes'] = aws_conf - - edit_endpoint = "/clusters/edit" - results = client.post(edit_endpoint, current_cluster_json, print_json=False) - print(f"{datetime.now()} {c_name} was updated with {instance_profiles[c_name]}. Status code: {results.get('http_status_code', 0)}") - - return - -def confirm_updated_ips(client, instance_profiles): - cnames = get_clusters_list(client) - for c in cnames: # in updated e2 clusters - c_name = c.get('cluster_name', 0) - ip = c.get('aws_attributes', {}).get('instance_profile_arn', 0) # updated ip? - if c_name in instance_profiles.keys(): - if ip != 0: - if ip != instance_profiles[c_name]: - print(f"{datetime.now()} Error: {c_name} was not updated. ") - else: - print(f"{datetime.now()} {c_name} was updated. ") - else: - print(f"{datetime.now()} Error: {c_name} was not updated. ") - else: - print(f"{datetime.now()} {c_name} did not require update. 
") - -if __name__ == "__main__": - # get the arguments - parser = argparse.ArgumentParser() - parser.add_argument("--log", help="log file to read", default="logs/session/clusters.log") - parser.add_argument("--token", help="databricks token to use", default="") - parser.add_argument("--url", help="databricks url to use", default="") - args = parser.parse_args() - - client = dbclient(args.token, args.url) - #cnames = get_clusters_list(client) - ips = get_clusters_ips(log_name=args.log) - update_cluster_ips(client, ips) - confirm_updated_ips(client, ips) From 47e3dc9293d605431f5c24caa16ab891cb9ce85b Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:46:40 -0500 Subject: [PATCH 090/111] Delete data/notebooks/rename_emails.py --- data/notebooks/rename_emails.py | 287 -------------------------------- 1 file changed, 287 deletions(-) delete mode 100644 data/notebooks/rename_emails.py diff --git a/data/notebooks/rename_emails.py b/data/notebooks/rename_emails.py deleted file mode 100644 index 0f1b9ed0..00000000 --- a/data/notebooks/rename_emails.py +++ /dev/null @@ -1,287 +0,0 @@ -import argparse -import os -import shutil -import csv -import json -import requests - -class dbclient: - """ - Rest API Wrapper for Databricks APIs - """ - # set of http error codes to throw an exception if hit. Handles client and auth errors - http_error_codes = (401, 403) - - def __init__(self, token, url): - self._token = {'Authorization': 'Bearer {0}'.format(token)} - self._url = url.rstrip("/") - self._is_verbose = False - self._verify_ssl = False - if self._verify_ssl: - # set these env variables if skip SSL verification is enabled - os.environ['REQUESTS_CA_BUNDLE'] = "" - os.environ['CURL_CA_BUNDLE'] = "" - - def is_aws(self): - return self._is_aws - - def is_verbose(self): - return self._is_verbose - - def is_skip_failed(self): - return self._skip_failed - - def test_connection(self): - # verify the proper url settings to configure this client - if self._url[-4:] != '.com' and self._url[-4:] != '.net': - print("Hostname should end in '.com'") - return -1 - results = requests.get(self._url + '/api/2.0/clusters/spark-versions', headers=self._token, - verify=self._verify_ssl) - http_status_code = results.status_code - if http_status_code != 200: - print("Error. Either the credentials have expired or the credentials don't have proper permissions.") - print("If you have a ~/.netrc file, check those credentials. 
Those take precedence over passed input.") - print(results.text) - return -1 - return 0 - - def get(self, endpoint, json_params=None, version='2.0', print_json=False): - if version: - ver = version - full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint - if self.is_verbose(): - print("Get: {0}".format(full_endpoint)) - if json_params: - raw_results = requests.get(full_endpoint, headers=self._token, params=json_params, verify=self._verify_ssl) - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) - results = raw_results.json() - else: - raw_results = requests.get(full_endpoint, headers=self._token, verify=self._verify_ssl) - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: GET request failed with code {}\n{}".format(http_status_code, raw_results.text)) - results = raw_results.json() - if print_json: - print(json.dumps(results, indent=4, sort_keys=True)) - if type(results) == list: - results = {'elements': results} - results['http_status_code'] = raw_results.status_code - return results - - def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=False, files_json=None): - if version: - ver = version - full_endpoint = self._url + '/api/{0}'.format(ver) + endpoint - if self.is_verbose(): - print("{0}: {1}".format(http_type, full_endpoint)) - if json_params: - if http_type == 'post': - if files_json: - raw_results = requests.post(full_endpoint, headers=self._token, - data=json_params, files=files_json, verify=self._verify_ssl) - else: - raw_results = requests.post(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - if http_type == 'put': - raw_results = requests.put(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - if http_type == 'patch': - raw_results = requests.patch(full_endpoint, headers=self._token, - json=json_params, verify=self._verify_ssl) - - http_status_code = raw_results.status_code - if http_status_code in dbclient.http_error_codes: - raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type, - http_status_code, - raw_results.text)) - results = raw_results.json() - else: - print("Must have a payload in json_args param.") - return {} - if print_json: - print(json.dumps(results, indent=4, sort_keys=True)) - # if results are empty, let's return the return status - if results: - results['http_status_code'] = raw_results.status_code - return results - else: - return {'http_status_code': raw_results.status_code} - - def post(self, endpoint, json_params, version='2.0', print_json=False, files_json=None): - return self.http_req('post', endpoint, json_params, version, print_json, files_json) - - def put(self, endpoint, json_params, version='2.0', print_json=False): - return self.http_req('put', endpoint, json_params, version, print_json) - - def patch(self, endpoint, json_params, version='2.0', print_json=False): - return self.http_req('patch', endpoint, json_params, version, print_json) - - -def get_user_id_mapping(client): - # return a dict of the userName to id mapping of the new env - user_list = client.get('/preview/scim/v2/Users').get('Resources', None) - if user_list: - user_mapping = {} - for user in user_list: - user_name = user['userName'] - # user_name: ex. 
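# Minimal sketch of the lookup table get_user_id_mapping() builds from the
# SCIM Users listing. The response shape is reduced to the single field the
# function reads (userName); the addresses are illustrative only.
scim_resources = [
    {"userName": "ABC123@ccwdata.org"},
    {"userName": "XYZ789@ccwdata.org"},
]
user_mapping = {u["userName"].split("@")[0]: u["userName"] for u in scim_resources}
# user_mapping == {"ABC123": "ABC123@ccwdata.org", "XYZ789": "XYZ789@ccwdata.org"}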
ABC123@ccwdata.org - # we need to remove the domain name - user_mapping[user_name.split('@')[0]] = user_name - return user_mapping - return None - - -def pretty_print_dict(dict_): - """ - summary: prints a dictionary object in a pretty format - - PARAMETERS: - dict_: dictionary object - - RETURNS: - n/a - """ - for key, value in dict_.items(): - print(f"{key}: {value}") - -def write_json(file_name, data_write): - """ - summary: writes parameter data_write to the path indicated by parameter - file_name - - PARAMETERS: - file_name: path of the file that is to be written - data_write: text object - - RETURNS: - n/a - """ - with open(file_name, "w") as f: - f.write(data_write) - - -def map(file_name, mapping): - """ - summary: reads parameter file_name and replaces all places where previous email - address is used with the new item as indicated in mapping - - PARAMETERS: - file_name: path of the file that is to be read - mapping: dict where key is the previous item and value is the - new item - - RETURNS: - data: a text object - - """ - with open(file_name, "r") as f: - data = f.read() - print(f" Currently mapping {file_name}") - for e in mapping: - if "@" in mapping[e]: # this is an user - data = data.replace(f"\"user_name\": \"{e}\"", f"\"user_name\": \"{mapping[e]}\"") # in most ACLs - print(f"\"/Users/{e}/") - print(f"\"/Users/{mapping[e]}/") - data = data.replace(f"\"/Users/{e}/", f"\"/Users/{mapping[e]}/") # in notebook paths - data = data.replace(f"\"display\": \"{e}\"", f"\"display\": \"{mapping[e]}\"") # in groups - data = data.replace(f"\"userName\": \"{e}\"", f"\"userName\": \"{mapping[e]}\"") # in groups - data = data.replace(f"\"principal\": \"{e}\"", f"\"principal\": \"{mapping[e]}\"") # in secret ACLs - else: # this is a service principal - data = data.replace(f"\"user_name\": \"{e}\"", f"\"service_principal_name\": \"{mapping[e]}\"") # in most ACLs - data = data.replace(f"\"display\": \"{e}\"", f"\"display\": \"{mapping[e]}\"") # in groups - data = data.replace(f"\"principal\": \"{e}\"", f"\"principal\": \"{mapping[e]}\"") # in secret ACLs - - return data - -def rename_users_folder(mapping): - """ - summary: renames the user folder by moving all files to new directory - - PARAMETERS: - mapping: dict where key is the previous item and value is the - new item - - RETURNS: - n/a - """ - import shutil - - users = os.listdir('./artifacts/Users') - for u in users: - if '.DS_Store' not in u: - if mapping.get(u, False): - shutil.move("./artifacts/Users/"+u, "./artifacts/NewUsers/"+mapping[u]) - else: - shutil.move("./artifacts/Users/"+u, "./artifacts/NewUsers/"+u) - - os.rename("./artifacts/Users", "./artifacts/EmptyDir") # this is an empty dir - os.rename("./artifacts/NewUsers", "./artifacts/Users") - - -def mapping_file(file_name, mapping): - """ - summary: maps a single file and writes it to a new file and saves the old - log file with the '_prev' suffix - - PARAMETERS: - file_name: path of the file to map - mapping: dict where key is the previous item and value is the - new item - - RETURNS: - n/a - """ - # this code here (directly referencing the number 4) assumes that the file name - # has the 3 letter extension (e.g. something.txt or something.csv - data = map(file_name, mapping) - write_json(file_name, data) - - -def main(): - all_args = argparse.ArgumentParser() - all_args.add_argument("--file", dest="file", required=True, help='files to map. e.g. 
logs/session') - all_args.add_argument("--host", dest="host", required=True, help='host of the new databricks env') - all_args.add_argument("--token", dest="token", required=True, help='token of the new databricks env') - - args = all_args.parse_args() - file_name = args.file - - client = dbclient(args.token, args.host) - mapping = get_user_id_mapping(client) - #mapping = {"admin": "ADMIN_NEW@GMAIL.COM", "service_principal": "service_principal_id"} - print("--------------------") - pretty_print_dict(mapping) - print("--------------------") - yesno = input("Confirm mapping (y/n): ") - if yesno.lower() != "y": - exit() - - - # change the current working director to specified path - #os.chdir("logs/session") - os.chdir(file_name) - # verify the path using getcwd() - cwd = os.getcwd() - print("--------------------") - print("Current working directory is:", cwd) - - logs = os.listdir() - - for file in logs: - # making sure we are only getting the logs - if ".log" in file: - mapping_file(file, mapping) - if "groups" == file: - groups = os.listdir("groups") - for g in groups: - if g != ".DS_Store": - mapping_file("groups/"+g, mapping) - - - #rename_users_folder(mapping) - -if __name__ == "__main__": - main() From 4d927e95217e71e48b3f39eaf5fa0d15145bd247 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:46:53 -0500 Subject: [PATCH 091/111] Delete data/notebooks/delete_clusters.py --- data/notebooks/delete_clusters.py | 42 ------------------------------- 1 file changed, 42 deletions(-) delete mode 100644 data/notebooks/delete_clusters.py diff --git a/data/notebooks/delete_clusters.py b/data/notebooks/delete_clusters.py deleted file mode 100644 index 116bc221..00000000 --- a/data/notebooks/delete_clusters.py +++ /dev/null @@ -1,42 +0,0 @@ -import requests - -# This script will delete all clusters in a Databricks workspace -# Set the Databricks API endpoint and access token - -# This script will delete all clusters in a Databricks workspace -# Set the Databricks API endpoint and access token -CURRENT_CLUSTER_ID = dbutils.notebook.entry_point.getDbutils().notebook().getContext().clusterId().getOrElse(None) -DATABRICKS_INSTANCE = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None) -DATABRICKS_TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) - -# Set the API endpoint and access token -api_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters/list" -access_token = DATABRICKS_TOKEN - -# Send a GET request to retrieve the list of clusters -response = requests.get(api_endpoint, headers={"Authorization": f"Bearer {access_token}"}) - -# Check if the request was successful -if response.status_code == 200: - clusters = response.json()["clusters"] - print(f"Found {len(clusters)} clusters") - # Delete each cluster - for cluster in clusters: - cluster_id = cluster["cluster_id"] - - print(f"Cluster {cluster_id} unpinned successfully") - - if cluster_id == CURRENT_CLUSTER_ID: - print(f"Skipping current cluster {cluster_id}") - continue - # Delete the cluster - delete_endpoint = f"https://{DATABRICKS_INSTANCE}/api/2.0/clusters//api/2.0/clusters/permanent-delete?cluster_id={cluster_id}" - delete_response = requests.post(delete_endpoint, headers={"Authorization": f"Bearer {access_token}"}) - - # Check if the cluster deletion was successful - if delete_response.status_code == 200: - print(f"Cluster {cluster_id} deleted successfully") - else: - print(f"Failed to 
delete cluster {cluster_id}") -else: - print("Failed to retrieve the list of clusters") From b0253449b6dc8b8306b5f15a19500dc94e222786 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Wed, 7 Feb 2024 15:47:03 -0500 Subject: [PATCH 092/111] Delete data/notebooks/replace_groups.py --- data/notebooks/replace_groups.py | 157 ------------------------------- 1 file changed, 157 deletions(-) delete mode 100644 data/notebooks/replace_groups.py diff --git a/data/notebooks/replace_groups.py b/data/notebooks/replace_groups.py deleted file mode 100644 index f577a46e..00000000 --- a/data/notebooks/replace_groups.py +++ /dev/null @@ -1,157 +0,0 @@ -import argparse -import os -import csv - -def pretty_print_dict(dict_): - """ - summary: prints a dictionary object in a pretty format - - PARAMETERS: - dict_: dictionary object - - RETURNS: - n/a - """ - for key, value in dict_.items(): - print(f"{key}: {value}") - -def to_dict(csv_file, email_column='newEmail'): - """ - summary: converts a csv or text file (or another comma delim file) into a - dictionary object - - PARAMETERS: - csv_file: path file of the comma delim file, assumes that there are no column - headings, each user address is split by a new line, and the old and new - address are split by a comma in that order. - - RETURNS: - dict_from_csv: dictionary object where key is the old item and value - is new item - """ - dict_from_csv = {} - with open(csv_file, newline='', mode='r') as f: - reader = csv.DictReader(f) - for row in reader: - dict_from_csv[row['group_name']] = row[email_column] - return dict_from_csv - -def map(file_name, mapping): - """ - summary: reads parameter file_name and replaces all places where previous email - address is used with the new item as indicated in mapping - - PARAMETERS: - file_name: path of the file that is to be read - mapping: dict where key is the previous item and value is the - new item - - RETURNS: - data: a text object - - """ - print(f" Currently mapping {file_name}") - with open(file_name, "r") as f: - data = f.read() - for e in mapping: - data = data.replace(e, mapping[e]) - return data - -def write(file_name, data_write): - """ - summary: writes parameter data_write to the path indicated by parameter - file_name - - PARAMETERS: - file_name: path of the file that is to be written - data_write: text object - - RETURNS: - n/a - """ - with open(file_name, "w") as f: - f.write(data_write) - - -def mapping_file(file_name, mapping): - """ - summary: maps a single file and writes it to a new file and saves the old - log file with the '_prev' suffix - - PARAMETERS: - file_name: path of the file to map - mapping: dict where key is the previous item and value is the - new item - - RETURNS: - n/a - """ - # this code here (directly referencing the number 4) assumes that the file name - # has the 3 letter extension (e.g. 
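# Sketch of the mapping-file shape that to_dict() expects. Because it uses
# csv.DictReader, the file needs a header row containing 'group_name' plus
# the column named by --new-email-column (default 'newEmail'); the group
# names below are made-up examples.
import csv
import io

sample_csv = "group_name,newEmail\nold-data-eng,new-data-eng\nold-analysts,new-analysts\n"
mapping = {row["group_name"]: row["newEmail"] for row in csv.DictReader(io.StringIO(sample_csv))}
# mapping == {"old-data-eng": "new-data-eng", "old-analysts": "new-analysts"}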
something.txt or something.csv - data = map(file_name, mapping) - write(file_name, data) - -def rename_group_file(mapping): - """ - summary: renames the user folder by moving all files to new directory - - PARAMETERS: - mapping: dict where key is the previous item and value is the - new item - - RETURNS: - n/a - """ - groups = os.listdir('groups') - for g in groups: - if '.DS_Store' in g: - continue - if mapping.get(g, False): - os.rename("groups/"+g, "groups/"+mapping[g]) - -def main(): - all_args = argparse.ArgumentParser() - all_args.add_argument("--dir", "--file", dest="file", required=True, help='directory needs to be updated via mapping.') - all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file') - all_args.add_argument("--new-email-column", dest="column", required=True, help='email column in the mapping file with updated email addresses') - - args = all_args.parse_args() - file_name = args.file - mapping_file_ = args.mapping - email_column = args.column - - mapping = to_dict(mapping_file_, email_column) - mapping = {"old_group_name": "new_group_name"} - print("--------------------") - pretty_print_dict(mapping) - print("--------------------") - yesno = input("Confirm mapping (y/n): ") - if yesno.lower() != "y": - exit() - - # change the current working director to specified path - os.chdir(file_name) - # verify the path using getcwd() - cwd = os.getcwd() - print("--------------------") - print("Current working directory is:", cwd) - - logs = os.listdir() - - for file in logs: - if '.DS_Store' in file: - continue - # making sure we are only getting the logs - if ".log" in file: - mapping_file(file, mapping) - if "groups" == file: - groups = os.listdir("groups") - for g in groups: - if '.DS_Store' in g: - continue - mapping_file("groups/"+g, mapping) - - rename_group_file(mapping) - -if __name__ == "__main__": - main() From a0ba5e6193e3461eb4c58f51b720582a156ab395 Mon Sep 17 00:00:00 2001 From: Veena <31749302+veenaramesh@users.noreply.github.com> Date: Fri, 1 Mar 2024 14:28:59 -0500 Subject: [PATCH 093/111] Update ClustersClient.py --- dbclient/ClustersClient.py | 70 +++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index 97099e53..ac75b63f 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -1,5 +1,6 @@ import logging import os +import csv import re import time import logging_utils @@ -98,7 +99,7 @@ def cleanup_cluster_pool_configs(self, cluster_json, cluster_creator, is_job_clu if 'aws_attributes' in cluster_json: aws_conf = cluster_json.pop('aws_attributes') iam_role = aws_conf.get('instance_profile_arn', None) - if not iam_role: + if iam_role: cluster_json['aws_attributes'] = {'instance_profile_arn': iam_role} return cluster_json @@ -257,7 +258,19 @@ def get_new_policy_id_dict(self, policy_file='cluster_policies.log'): policy_id_dict[old_policy_id] = current_policies_dict[policy_name] # old_id : new_id return policy_id_dict - def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None): + def nitro_instance_mapping(self, instance_type_id): + dict_from_csv = {} + real_path = os.path.dirname(os.path.realpath(__file__)) + csv_file = f'{real_path}/../data/nitro_mapping.csv' + with open(csv_file, newline='', mode='r') as f: + reader = csv.DictReader(f) + for row in reader: + dict_from_csv[row['PVC Instance Type']] = row['Recommended 
Nitro Instance Type'] + + nitro_instance_type_id = dict_from_csv[instance_type_id] + return nitro_instance_type_id + + def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None, nitro=False): """ Import cluster configs and update appropriate properties / tags in the new env :param log_file: @@ -301,6 +314,12 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus else: cluster_conf['custom_tags'] = {'OriginalCreator': cluster_creator} new_cluster_conf = cluster_conf + if nitro: + if 'node_type_id' in new_cluster_conf: + new_cluster_conf['node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['node_type_id']) + if 'driver_node_type_id' in new_cluster_conf: + new_cluster_conf['driver_node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['driver_node_type_id']) + print("Creating cluster: {0}".format(new_cluster_conf['cluster_name'])) cluster_resp = self.post('/clusters/create', new_cluster_conf) if cluster_resp['http_status_code'] == 200: @@ -310,6 +329,9 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus if 'cluster_id' in cluster_conf: checkpoint_cluster_configs_set.write(cluster_conf['cluster_id']) else: + cluster_resp['old_cluster_id'] = cluster_conf['cluster_id'] + cluster_resp['old_cluster_name'] = cluster_conf['cluster_name'] + logging_utils.log_response_error(error_logger, cluster_resp) print(cluster_resp) @@ -338,14 +360,38 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus ignore_error_list = ["RESOURCE_DOES_NOT_EXIST", "RESOURCE_ALREADY_EXISTS"] else: ignore_error_list = ["RESOURCE_ALREADY_EXISTS"] - + if logging_utils.check_error(resp, ignore_error_list): + if resp['error_code'] == 'RESOURCE_DOES_NOT_EXIST': + resp = self.remove_missing_users(api, acl_args, resp) + if not logging_utils.log_response_error(error_logger, resp): + if 'object_id' in data: + checkpoint_cluster_configs_set.write(data['object_id']) + else: + logging_utils.log_response_error(error_logger, resp) logging_utils.log_response_error(error_logger, resp) elif 'object_id' in data: checkpoint_cluster_configs_set.write(data['object_id']) print(resp) + def remove_missing_users(self, api, acl_args, resp): + # example message: 'Principal: UserName(x.x@email.com) does not exist' + # or 'Principal: GroupName(x.x) does not exist' + resp = self.put(api, acl_args) + while resp.get('error_code', '') == 'RESOURCE_DOES_NOT_EXIST': + if 'UserName' in resp['message']: + missing_user = re.search(r'Principal: UserName\((.*)\) does not exist', resp['message']).group(1) + logging.info(f"Removing missing user {missing_user} from ACL") + acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl.get('user_name', None) != missing_user] + resp = self.put(api, acl_args) + elif 'GroupName' in resp['message']: + missing_group = re.search(r'Principal: GroupName\((.*)\) does not exist', resp['message']).group(1) + logging.info(f"Removing missing group {missing_group} from ACL") + acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl.get('group_name', None) != missing_group] + resp = self.put(api, acl_args) + return resp + def _log_cluster_ids_and_original_creators( self, cluster_log_file, @@ -570,10 +616,10 @@ def log_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_cluster # get users list based on groups_to_keep users_list = [] - #if self.groups_to_keep is not False: - # all_users = 
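# Standalone sketch of the principal-stripping step in remove_missing_users():
# given the documented error-message shape, the regex extracts the missing
# principal and its ACL entry is dropped before the request is retried.
# The message text and ACL entries below are illustrative values only.
import re

message = "Principal: UserName(jane.doe@example.com) does not exist"
acl = [
    {"user_name": "jane.doe@example.com", "permission_level": "CAN_MANAGE"},
    {"group_name": "platform-admins", "permission_level": "CAN_MANAGE"},
]
missing = re.search(r"Principal: UserName\((.*)\) does not exist", message).group(1)
acl = [entry for entry in acl if entry.get("user_name") != missing]
# acl now contains only the platform-admins group entry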
self.get('/preview/scim/v2/Users').get('Resources', None) - # users_list = list(set([user.get("emails")[0].get("value") for user in all_users - # for group in user.get("groups") if group.get("display") in self.groups_to_keep])) + if self.groups_to_keep is not False: + all_users = self.get('/preview/scim/v2/Users').get('Resources', None) + users_list = list(set([user.get("emails")[0].get("value") for user in all_users + for group in user.get("groups") if group.get("display") in self.groups_to_keep])) cluster_log = self.get_export_dir() + log_file acl_cluster_log = self.get_export_dir() + acl_log_file @@ -646,11 +692,11 @@ def log_cluster_policies(self, log_file='cluster_policies.log', acl_log_file='ac # get users list based on groups_to_keep users_list = [] - #if self.groups_to_keep is not False: - # all_users = self.get('/preview/scim/v2/Users').get('Resources', None) - # users_list = list(set([user.get("emails")[0].get("value") for user in all_users - # for group in user.get("groups") if - # group.get("display") in self.groups_to_keep])) + if self.groups_to_keep is not False: + all_users = self.get('/preview/scim/v2/Users').get('Resources', None) + users_list = list(set([user.get("emails")[0].get("value") for user in all_users + for group in user.get("groups") if + group.get("display") in self.groups_to_keep])) # log cluster policy ACLs, which takes a policy id as arguments with open(acl_policies_log, 'w', encoding="utf-8") as acl_fp: From d89bfe2d2d88229fba9f88aef7300c8547c182c3 Mon Sep 17 00:00:00 2001 From: cbartholomew2 <89409387+cbartholomew2@users.noreply.github.com> Date: Fri, 21 Jun 2024 14:50:42 -0400 Subject: [PATCH 094/111] Add files via upload --- WorkspaceClient_modified.py | 971 ++++++++++++++++++++++++++++++++++++ 1 file changed, 971 insertions(+) create mode 100644 WorkspaceClient_modified.py diff --git a/WorkspaceClient_modified.py b/WorkspaceClient_modified.py new file mode 100644 index 00000000..e39fd405 --- /dev/null +++ b/WorkspaceClient_modified.py @@ -0,0 +1,971 @@ +import base64 +import hashlib +import re + +from dbclient import * +import wmconstants +import concurrent +from concurrent.futures import ThreadPoolExecutor +from thread_safe_writer import ThreadSafeWriter +from threading_utils import propagate_exceptions +from timeit import default_timer as timer +from datetime import timedelta +import logging_utils +import logging +import os +from dbclient.common.WorkspaceDiff import * + +WS_LIST = "/workspace/list" +WS_STATUS = "/workspace/get-status" +WS_MKDIRS = "/workspace/mkdirs" +WS_IMPORT = "/workspace/import" +WS_EXPORT = "/workspace/export" +LS_ZONES = "/clusters/list-zones" +REPOS = "/repos" + +class WorkspaceClient(dbclient): + def __init__(self, configs, checkpoint_service): + super().__init__(configs) + self._checkpoint_service = checkpoint_service + self.groups_to_keep = configs.get("groups_to_keep", False) + self.skip_missing_users = configs['skip_missing_users'] + + _languages = {'.py': 'PYTHON', + '.scala': 'SCALA', + '.r': 'R', + '.sql': 'SQL'} + + def get_language(self, file_ext): + return self._languages[file_ext] + + def get_top_level_folders(self): + # get top level folders excluding the /Users path + supported_types = ('NOTEBOOK', 'DIRECTORY') + root_items = self.get(WS_LIST, {'path': '/'}).get('objects', []) + # filter out Projects and Users folders + non_users_dir = list(filter(lambda x: (x.get('path') not in ['/Users', '/Repos'] + and x.get('path') != '/Projects'), root_items)) + dirs_and_nbs = list(filter(lambda x: (x.get('object_type') in 
supported_types), + non_users_dir)) + return dirs_and_nbs + + def export_top_level_folders(self): + ls_tld = self.get_top_level_folders() + logged_nb_count = 0 + workspace_log_writer = ThreadSafeWriter(self.get_export_dir() + 'user_workspace.log', "a") + libs_log_writer = ThreadSafeWriter(self.get_export_dir() + 'libraries.log', "a") + dir_log_writer = ThreadSafeWriter(self.get_export_dir() + 'user_dirs.log', "a") + checkpoint_item_log_set = self._checkpoint_service.get_checkpoint_key_set( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT + ) + try: + for tld_obj in ls_tld: + # obj has 3 keys, object_type, path, object_id + tld_path = tld_obj.get('path') + log_count = self.log_all_workspace_items( + tld_path, workspace_log_writer, libs_log_writer, dir_log_writer, checkpoint_item_log_set) + logged_nb_count += log_count + finally: + workspace_log_writer.close() + libs_log_writer.close() + dir_log_writer.close() + dl_nb_count = self.download_notebooks() + print(f'Total logged notebooks: {logged_nb_count}') + print(f'Total Downloaded notebooks: {dl_nb_count}') + + def get_user_import_args(self, full_local_path, nb_full_path): + """ + helper function to define the import parameters to upload a notebook object + :param full_local_path: full local path of the notebook to read + :param nb_full_path: full destination path, e.g. /Users/foo@db.com/bar.dbc . Includes extension / type + :return: return the full input args to upload to the destination system + """ + fp = open(full_local_path, "rb") + (nb_path_dest, nb_type) = os.path.splitext(nb_full_path) + in_args = { + "content": base64.encodebytes(fp.read()).decode('utf-8'), + "path": nb_path_dest, + "format": self.get_file_format() + } + if self.is_source_file_format(): + if self.is_overwrite_notebooks(): + in_args['overwrite'] = True + if nb_type == '.dbc': + raise ValueError('Export is in DBC default format. Must export as SOURCE') + in_args['language'] = self.get_language(nb_type) + in_args['object_type'] = 'NOTEBOOK' + return in_args + + @staticmethod + def build_ws_lookup_table(success_ws_logfile): + ws_hashmap = set() + with open(success_ws_logfile, 'r', encoding='utf-8') as fp: + for line in fp: + ws_hashmap.add(line.rstrip()) + return ws_hashmap + + @staticmethod + def is_user_ws_item(ws_dir): + """ + Checks if this is a user artifact / notebook. + We can't create user home folders, hence we need to identify user items + """ + path_list = [x for x in ws_dir.split('/') if x] + if len(path_list) >= 2 and path_list[0] == 'Users': + return True + return False + + @staticmethod + def is_repo(ws_dir): + """ + Checks if this item is part of a repo. 
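# Small, self-contained illustration of the path checks above, run on made-up
# workspace paths: user items live under /Users/<email>/..., repo items under
# /Repos/..., and the helpers key off the first path component.
def _components(ws_dir):
    return [x for x in ws_dir.split('/') if x]

def looks_like_user_item(ws_dir):
    parts = _components(ws_dir)
    return len(parts) >= 2 and parts[0] == 'Users'

def looks_like_repo_item(ws_dir):
    parts = _components(ws_dir)
    return len(parts) >= 2 and parts[0] == 'Repos'

assert looks_like_user_item('/Users/jane.doe@example.com/etl/daily')
assert looks_like_repo_item('/Repos/jane.doe@example.com/pipeline-repo')
assert not looks_like_user_item('/Shared/reports')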
+ We need to use a separate API for these, so they should not be treated as standard WS items + """ + path_list = [x for x in ws_dir.split('/') if x] + if len(path_list) >= 2 and path_list[0] == 'Repos': + return True + return False + + @staticmethod + def is_user_ws_root(ws_dir): + """ + Check if we're at the users home folder or repos root folder to skip folder creation + """ + if ws_dir in ['/Users/', '/Users', '/Repos/', '/Repos']: + return True + path_list = [x for x in ws_dir.split('/') if x] + if len(path_list) == 2 and path_list[0] == 'Users': + return True + return False + + @staticmethod + def get_user(ws_dir): + """ + returns the username of the workspace / folder path + """ + path_list = [x for x in ws_dir.split('/') if x] + if len(path_list) < 2: + raise ValueError("Error: Not a users workspace directory") + return path_list[1] + + @staticmethod + def is_user_trash(ws_path): + """ + checks if this is the users home folder trash directory, which is a special dir + """ + path_list = ws_path.split('/') + if len(path_list) == 4: + if path_list[1] == 'Users' and path_list[3] == 'Trash': + return True + return False + + def is_user_home_empty(self, username): + user_root = '/Users/' + username.rstrip().lstrip() + get_args = {'path': user_root} + items = self.get(WS_LIST, get_args).get('objects', None) + if items: + folders = self.filter_workspace_items(items, 'DIRECTORY') + notebooks = self.filter_workspace_items(items, 'NOTEBOOK') + # if both notebooks and directories are empty, return true + if not folders and not notebooks: + return True + return False + return True + + def get_num_of_saved_users(self, export_dir): + """ + returns the number of exported user items to check against number of created users in the new workspace + this helps identify if the new workspace is ready for the import, or if we should skip / archive failed imports + """ + # get current number of saved workspaces + user_home_dir = export_dir + 'Users' + num_of_users = 0 + if os.path.exists(user_home_dir): + ls = self.listdir(user_home_dir) + for x in ls: + if os.path.isdir(user_home_dir + '/' + x): + num_of_users += 1 + return num_of_users + + def export_user_home(self, username, local_export_dir, num_parallel=4): + """ + Export the provided user's home directory + :param username: user's home directory to export + :param local_export_dir: folder location to do single user exports + :return: None + """ + original_export_dir = self.get_export_dir() + user_export_dir = self.get_export_dir() + local_export_dir + user_root = '/Users/' + username.rstrip().lstrip() + self.set_export_dir(user_export_dir + '/{0}/'.format(username)) + print("Export path: {0}".format(self.get_export_dir())) + os.makedirs(self.get_export_dir(), exist_ok=True) + workspace_log_writer = ThreadSafeWriter(self.get_export_dir() + 'user_workspace.log', "a") + libs_log_writer = ThreadSafeWriter(self.get_export_dir() + 'libraries.log', "a") + dir_log_writer = ThreadSafeWriter(self.get_export_dir() + 'user_dirs.log', "a") + checkpoint_item_log_set = self._checkpoint_service.get_checkpoint_key_set( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT + ) + try: + num_of_nbs = self.log_all_workspace_items( + user_root, workspace_log_writer, libs_log_writer, dir_log_writer, checkpoint_item_log_set) + finally: + workspace_log_writer.close() + libs_log_writer.close() + dir_log_writer.close() + + if num_of_nbs == 0: + raise ValueError('User does not have any notebooks in this path. 
Please verify the case of the email') + num_of_nbs_dl = self.download_notebooks(ws_dir='user_artifacts/') + print(f"Total notebooks logged: {num_of_nbs}") + print(f"Total notebooks downloaded: {num_of_nbs_dl}") + if num_of_nbs != num_of_nbs_dl: + print(f"Notebooks logged != downloaded. Check the failed download file at: {user_export_dir}") + print(f"Exporting the notebook permissions for {username}") + acl_notebooks_writer = ThreadSafeWriter("acl_notebooks.log", "w") + acl_notebooks_error_logger = logging_utils.get_error_logger( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir()) + try: + self.log_acl_to_file( + 'notebooks', 'user_workspace.log', acl_notebooks_writer, acl_notebooks_error_logger, num_parallel) + finally: + acl_notebooks_writer.close() + + print(f"Exporting the directories permissions for {username}") + acl_directories_writer = ThreadSafeWriter("acl_directories.log", "w") + acl_directories_error_logger = logging_utils.get_error_logger( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT, self.get_export_dir()) + try: + self.log_acl_to_file( + 'directories', 'user_dirs.log', acl_directories_writer, acl_directories_error_logger, num_parallel) + finally: + acl_directories_writer.close() + # reset the original export dir for other calls to this method using the same client + self.set_export_dir(original_export_dir) + + def import_user_home(self, username, local_export_dir): + """ + Import the provided user's home directory + logs/user_exports/{{USERNAME}}/ stores the log files to understand what was exported + logs/user_exports/{{USERNAME}}/user_artifacts/ stores the notebook contents + :param username: user's home directory to export + :param local_export_dir: the log directory for this users workspace items + :return: None + """ + original_export_dir = self.get_export_dir() + user_import_dir = self.get_export_dir() + local_export_dir + if self.does_user_exist(username): + print("Yes, we can upload since the user exists") + else: + print("User must exist before we upload the notebook contents. 
Please add the user to the platform first") + user_root = '/Users/' + username.rstrip().lstrip() + self.set_export_dir(user_import_dir + '/{0}/'.format(username)) + print("Import local path: {0}".format(self.get_export_dir())) + notebook_dir = self.get_export_dir() + 'user_artifacts/' + for root, subdirs, files in self.walk(notebook_dir): + upload_dir = '/' + root.replace(notebook_dir, '') + # if the upload dir is the 2 root directories, skip and continue + if upload_dir == '/' or upload_dir == '/Users': + continue + if not self.is_user_ws_root(upload_dir): + # if it is not the /Users/example@example.com/ root path, don't create the folder + resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir}) + print(resp_mkdirs) + for f in files: + # get full path for the local notebook file + local_file_path = os.path.join(root, f) + # create upload path and remove file format extension + ws_file_path = upload_dir + '/' + f + # generate json args with binary data for notebook to upload to the workspace path + nb_input_args = self.get_user_import_args(local_file_path, ws_file_path) + # call import to the workspace + if self.is_verbose(): + print("Path: {0}".format(nb_input_args['path'])) + resp_upload = self.post(WS_IMPORT, nb_input_args) + if self.is_verbose(): + print(resp_upload) + + # import the user's workspace ACLs + notebook_acl_logs = user_import_dir + f'/{username}/acl_notebooks.log' + acl_notebooks_error_logger = logging_utils.get_error_logger( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir()) + if os.path.exists(notebook_acl_logs): + print(f"Importing the notebook acls for {username}") + with open(notebook_acl_logs, encoding='utf-8') as nb_acls_fp: + for nb_acl_str in nb_acls_fp: + self.apply_acl_on_object(nb_acl_str, acl_notebooks_error_logger) + + dir_acl_logs = user_import_dir + f'/{username}/acl_directories.log' + acl_dir_error_logger = logging_utils.get_error_logger( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT, self.get_export_dir()) + if os.path.exists(dir_acl_logs): + print(f"Importing the directory acls for {username}") + with open(dir_acl_logs, encoding='utf-8') as dir_acls_fp: + for dir_acl_str in dir_acls_fp: + self.apply_acl_on_object(dir_acl_str, acl_dir_error_logger) + self.set_export_dir(original_export_dir) + + def download_notebooks(self, ws_log_file='user_workspace.log', ws_dir='artifacts/', num_parallel=4): + """ + Loop through all notebook paths in the logfile and download individual notebooks + :param ws_log_file: logfile for all notebook paths in the workspace + :param ws_dir: export directory to store all notebooks + :return: None + """ + checkpoint_notebook_set = self._checkpoint_service.get_checkpoint_key_set( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT) + ws_log = self.get_export_dir() + ws_log_file + notebook_error_logger = logging_utils.get_error_logger( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT, self.get_export_dir()) + num_notebooks = 0 + if not os.path.exists(ws_log): + raise Exception("Run --workspace first to download full log of all notebooks.") + with open(ws_log, "r", encoding='utf-8') as fp: + # notebook log metadata file now contains object_id to help w/ ACL exports + # pull the path from the data to download the individual notebook contents + with ThreadPoolExecutor(max_workers=num_parallel) as executor: + futures = [executor.submit(self.download_notebook_helper, notebook_data, checkpoint_notebook_set, notebook_error_logger, self.get_export_dir() + 
ws_dir) for notebook_data in fp] + for future in concurrent.futures.as_completed(futures): + dl_resp = future.result() + if 'error' not in dl_resp: + num_notebooks += 1 + return num_notebooks + + def download_notebook_helper(self, notebook_data, checkpoint_notebook_set, error_logger, export_dir='artifacts/'): + """ + Helper function to download an individual notebook, or log the failure in the failure logfile + :param notebook_path: an individual notebook path + :param export_dir: directory to store all notebooks + :return: return the notebook path that's successfully downloaded + """ + notebook_path = json.loads(notebook_data).get('path', None).rstrip('\n') + if checkpoint_notebook_set.contains(notebook_path): + return {'path': notebook_path} + get_args = {'path': notebook_path, 'format': self.get_file_format()} + if self.is_verbose(): + logging.info("Downloading: {0}".format(get_args['path'])) + resp = self.get(WS_EXPORT, get_args) + if resp.get('error', None): + resp['path'] = notebook_path + logging_utils.log_response_error(error_logger, resp) + return resp + if resp.get('error_code', None): + resp['path'] = notebook_path + logging_utils.log_response_error(error_logger, resp) + return resp + nb_path = os.path.dirname(notebook_path) + if nb_path != '/': + # path is NOT empty, remove the trailing slash from export_dir + save_path = export_dir[:-1] + nb_path + '/' + else: + save_path = export_dir + + # If the local path doesn't exist,we create it before we save the contents + if not os.path.exists(save_path) and save_path: + os.makedirs(save_path, exist_ok=True) + + save_filename = save_path + os.path.basename(notebook_path) + '.' + resp.get('file_type') + if os.path.isfile(save_filename): + logging.warning(f"Notebook file {save_filename} already exists; please rename in source workspace. " + f"Note that files are case-insensitive") + return {} + logging.info(save_filename) + with open(save_filename, "wb") as f: + f.write(base64.b64decode(resp['content'])) + checkpoint_notebook_set.write(notebook_path) + return {'path': notebook_path} + + def filter_workspace_items(self, item_list, item_type): + """ + Helper function to filter on different workspace types. + :param item_list: iterable of workspace items + :param item_type: DIRECTORY, NOTEBOOK, LIBRARY + :return: list of items filtered by type + """ + supported_types = {'DIRECTORY', 'NOTEBOOK', 'LIBRARY'} + if item_type not in supported_types: + raise ValueError('Unsupported type provided: {0}.\n. 
Supported types: {1}'.format(item_type, + str(supported_types))) + filtered_list = list(self.my_map(lambda y: {'path': y.get('path', None), + 'object_id': y.get('object_id', None)}, + filter(lambda x: x.get('object_type', None) == item_type, item_list))) + return filtered_list + + def init_workspace_logfiles(self, workspace_log_file='user_workspace.log', + libs_log_file='libraries.log', workspace_dir_log_file='user_dirs.log'): + """ + initialize the logfile locations since we run a recursive function to download notebooks + """ + workspace_log = self.get_export_dir() + workspace_log_file + libs_log = self.get_export_dir() + libs_log_file + workspace_dir_log = self.get_export_dir() + workspace_dir_log_file + if not self._checkpoint_service.checkpoint_file_exists(wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT): + if os.path.exists(workspace_log): + os.remove(workspace_log) + if os.path.exists(workspace_dir_log): + os.remove(workspace_dir_log) + if os.path.exists(libs_log): + os.remove(libs_log) + + def log_all_workspace_items_entry(self, ws_path='/', workspace_log_file='user_workspace.log', libs_log_file='libraries.log', dir_log_file='user_dirs.log', repos_log_file='repos.log', exclude_prefixes=[]): + logging.info(f"Skip all paths with the following prefixes: {exclude_prefixes}") + + workspace_log_writer = ThreadSafeWriter(self.get_export_dir() + workspace_log_file, "a") + libs_log_writer = ThreadSafeWriter(self.get_export_dir() + libs_log_file, "a") + dir_log_writer = ThreadSafeWriter(self.get_export_dir() + dir_log_file, "a") + #repos_log_writer = ThreadSafeWriter(self.get_export_dir() + repos_log_file, "a") + checkpoint_item_log_set = self._checkpoint_service.get_checkpoint_key_set( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT + ) + try: + num_nbs = self.log_all_workspace_items(ws_path=ws_path, + workspace_log_writer=workspace_log_writer, + libs_log_writer=libs_log_writer, + dir_log_writer=dir_log_writer, + repos_log_writer=None, + checkpoint_set=checkpoint_item_log_set, + exclude_prefixes=exclude_prefixes) + finally: + workspace_log_writer.close() + libs_log_writer.close() + dir_log_writer.close() + #repos_log_writer.close() + + return num_nbs + + def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer, dir_log_writer, repos_log_writer, checkpoint_set, exclude_prefixes=[]): + """ + Loop and log all workspace items to download them at a later time + :param ws_path: root path to log all the items of the notebook workspace + :param workspace_log_file: logfile to store all the paths of the notebooks + :param libs_log_file: library logfile to store workspace libraries + :param dir_log_file: log directory for users + :return: + """ + # define log file names for notebooks, folders, and libraries + if ws_path == '/': + # default is the root path + get_args = {'path': '/'} + else: + get_args = {'path': ws_path} + + if not os.path.exists(self.get_export_dir()): + os.makedirs(self.get_export_dir(), exist_ok=True) + items = self.get(WS_LIST, get_args).get('objects', None) + #repos = self.get(REPOS).get('repos', None) + num_nbs = 0 + if self.is_verbose(): + logging.info("Listing: {0}".format(get_args['path'])) + if items: + # list all the users folders only + folders = self.filter_workspace_items(items, 'DIRECTORY') + # should be no notebooks, but lets filter and can check later + notebooks = self.filter_workspace_items(items, 'NOTEBOOK') + libraries = self.filter_workspace_items(items, 'LIBRARY') + # only get user list if we are filtering by 
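# Sketch of the object_type filtering performed by filter_workspace_items(),
# applied to a hypothetical /workspace/list response. Only the 'path',
# 'object_id' and 'object_type' fields are assumed, which are the fields the
# filter actually reads.
sample_objects = [
    {"path": "/Shared/etl", "object_id": 101, "object_type": "DIRECTORY"},
    {"path": "/Shared/etl/daily_load", "object_id": 102, "object_type": "NOTEBOOK"},
    {"path": "/Shared/libs/helper.jar", "object_id": 103, "object_type": "LIBRARY"},
]
notebooks = [
    {"path": o.get("path"), "object_id": o.get("object_id")}
    for o in sample_objects
    if o.get("object_type") == "NOTEBOOK"
]
# notebooks == [{"path": "/Shared/etl/daily_load", "object_id": 102}]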
group + ws_users = self.get('/preview/scim/v2/Users').get('Resources', None) if self.groups_to_keep else [] + for x in notebooks: + # notebook objects has path and object_id + nb_path = x.get('path') + + # if the current user is not in kept groups, skip this nb + if self.groups_to_keep and self.is_user_ws_item(nb_path): + nb_user = self.get_user(nb_path) + user_groups = [group.get("display") for user in ws_users if user.get("emails")[0].get("value") == nb_user for group in user.get("groups")] + if not set(user_groups).intersection(set(self.groups_to_keep)): + if self.is_verbose(): + logging.info("Skipped notebook path due to group exclusion: {0}".format(x.get('path'))) + continue + + if not checkpoint_set.contains(nb_path) and not nb_path.startswith(tuple(exclude_prefixes)): + if self.is_verbose(): + logging.info("Saving path: {0}".format(x.get('path'))) + workspace_log_writer.write(json.dumps(x) + '\n') + checkpoint_set.write(nb_path) + num_nbs += 1 + for y in libraries: + lib_path = y.get('path') + + # if the current user is not in kept groups, skip this lib + if self.groups_to_keep and self.is_user_ws_item(lib_path): + nb_user = self.get_user(lib_path) + user_groups = [group.get("display") for user in ws_users if user.get("emails")[0].get("value") == nb_user for group in user.get("groups")] + if not set(user_groups).intersection(set(self.groups_to_keep)): + if self.is_verbose(): + logging.info("Skipped library path due to group exclusion: {0}".format(lib_path)) + continue + + if not checkpoint_set.contains(lib_path) and not lib_path.startswith(tuple(exclude_prefixes)): + libs_log_writer.write(json.dumps(y) + '\n') + checkpoint_set.write(lib_path) + # log all directories to export permissions + if folders: + def _recurse_log_all_workspace_items(folder): + dir_path = folder.get('path', None) + if not self.is_user_trash(dir_path) and not self.is_repo(dir_path): + dir_log_writer.write(json.dumps(folder) + '\n') + return self.log_all_workspace_items(ws_path=dir_path, + workspace_log_writer=workspace_log_writer, + libs_log_writer=libs_log_writer, + dir_log_writer=dir_log_writer, + repos_log_writer=None, + checkpoint_set=checkpoint_set, + exclude_prefixes=exclude_prefixes) + + for folder in folders: + dir_path = folder.get('path', None) + + # if the current user is not in kept groups, skip this dir + if self.groups_to_keep and self.is_user_ws_item(dir_path): + dir_user = self.get_user(dir_path) + user_groups = [group.get("display") for user in ws_users if + user.get("emails")[0].get("value") == dir_user for group in user.get("groups")] + if not set(user_groups).intersection(set(self.groups_to_keep)): + if self.is_verbose(): + logging.info("Skipped directory due to group exclusion: {0}".format(dir_path)) + continue + + if not checkpoint_set.contains(dir_path) and not dir_path.startswith(tuple(exclude_prefixes)): + num_nbs_plus = _recurse_log_all_workspace_items(folder) + checkpoint_set.write(dir_path) + if num_nbs_plus: + num_nbs += num_nbs_plus + # log all repos + + # if repos_log_writer and repos: + # for repo in repos: + # repo_path = repo.get('path', "") + # if not checkpoint_set.contains(repo_path) and not repo_path.startswith(tuple(exclude_prefixes)): + # repos_log_writer.write(json.dumps(repo) + '\n') + # checkpoint_set.write(repo_path) + + return num_nbs + + def get_obj_id_by_path(self, input_path): + resp = self.get(WS_STATUS, {'path': input_path}) + obj_id = resp.get('object_id', None) + return obj_id + + def log_acl_to_file(self, artifact_type, read_log_filename, writer, 
error_logger, num_parallel): + """ + generic function to log the notebook/directory ACLs to specific file names + :param artifact_type: set('notebooks', 'directories') ACLs to be logged + :param read_log_filename: the list of the notebook paths / object ids + :param write_log_filename: output file to store object_id acls + :param error_logger: logger to log errors + """ + read_log_path = self.get_export_dir() + read_log_filename + if not os.path.exists(read_log_path): + logging.info(f"No log exists for {read_log_path}. Skipping ACL export ...") + return + + def _acl_log_helper(json_data): + data = json.loads(json_data) + obj_id = data.get('object_id', None) + alt_id = data.get('id', None) + + if alt_id and not obj_id: + obj_id = alt_id + + api_endpoint = '/permissions/{0}/{1}'.format(artifact_type, obj_id) + acl_resp = self.get(api_endpoint) + acl_resp['path'] = data.get('path') + if logging_utils.log_response_error(error_logger, acl_resp): + return + acl_resp.pop('http_status_code') + writer.write(json.dumps(acl_resp) + '\n') + + with open(read_log_path, 'r', encoding='utf-8') as read_fp: + with ThreadPoolExecutor(max_workers=num_parallel) as executor: + futures = [executor.submit(_acl_log_helper, json_data) for json_data in read_fp] + concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") + propagate_exceptions(futures) + + def log_all_workspace_acls(self, workspace_log_file='user_workspace.log', + dir_log_file='user_dirs.log', + repo_log_file="repos.log", + num_parallel=4): + """ + loop through all notebooks and directories to store their associated ACLs + :param workspace_log_file: input file for user notebook listing + :param dir_log_file: input file for user directory listing + :param repo_log_file: input file for repo listing + """ + # define log file names for notebooks, folders, and libraries + logging.info("Exporting the notebook permissions") + start = timer() + acl_notebooks_error_logger = logging_utils.get_error_logger( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir()) + acl_notebooks_writer = ThreadSafeWriter(self.get_export_dir() + "acl_notebooks.log", "w") + try: + self.log_acl_to_file('notebooks', workspace_log_file, acl_notebooks_writer, acl_notebooks_error_logger, num_parallel) + finally: + acl_notebooks_writer.close() + end = timer() + logging.info("Complete Notebook ACLs Export Time: " + str(timedelta(seconds=end - start))) + + logging.info("Exporting the directories permissions") + start = timer() + acl_directory_error_logger = logging_utils.get_error_logger( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT, self.get_export_dir()) + acl_directory_writer = ThreadSafeWriter(self.get_export_dir() + "acl_directories.log", "w") + try: + self.log_acl_to_file('directories', dir_log_file, acl_directory_writer, acl_directory_error_logger, num_parallel) + finally: + acl_directory_writer.close() + end = timer() + logging.info("Complete Directories ACLs Export Time: " + str(timedelta(seconds=end - start))) + + logging.info("Exporting the repo permissions") + start = timer() + acl_repo_error_logger = logging_utils.get_error_logger( + wmconstants.WM_EXPORT, wmconstants.WORKSPACE_REPO_ACL_OBJECT, self.get_export_dir()) + acl_repo_writer = ThreadSafeWriter(self.get_export_dir() + "acl_repos.log", "w") + try: + self.log_acl_to_file('repos', repo_log_file, acl_repo_writer, acl_repo_error_logger, + num_parallel) + finally: + acl_repo_writer.close() + end = timer() + logging.info("Complete Repo ACLs Export Time: " + 
str(timedelta(seconds=end - start))) + + def apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set): + """ + apply the acl definition to the workspace object + object_id comes from the export data which contains '/type/id' format for this key + the object_id contains the {{/type/object_id}} format which helps craft the api endpoint + setting acl definitions uses the patch rest api verb + :param acl_str: the complete string from the logfile. contains object defn and acl lists + """ + object_acl = json.loads(acl_str) + # the object_type + object_type = object_acl.get('object_type', None) + obj_path = object_acl['path'] + logging.info(f"Working on ACL for path: {obj_path}") + + if not checkpoint_key_set.contains(obj_path): + # We cannot modify '/Shared' directory's ACL + if obj_path == "/Shared" and object_type == "directory": + logging.info("We cannot modify /Shared directory's ACL. Skipping..") + checkpoint_key_set.write(obj_path) + return + + if self.is_user_ws_item(obj_path): + ws_user = self.get_user(obj_path) + if not self.does_user_exist(ws_user): + logging.info(f"User workspace does not exist: {obj_path}, skipping ACL") + return + obj_status = self.get(WS_STATUS, {'path': obj_path}) + + if self.is_repo(obj_path): + if logging_utils.check_error(obj_status): + logging.warning(f"Could not apply ACL to repo {obj_path}") + return + + if logging_utils.log_response_error(error_logger, obj_status): + return + logging.info("ws-stat: ", obj_status) + current_obj_id = obj_status.get('object_id', None) + if not current_obj_id: + error_logger.error(f'Object id missing from destination workspace: {obj_status}') + return + if object_type == 'directory': + object_id_with_type = f'/directories/{current_obj_id}' + elif object_type == 'notebook': + object_id_with_type = f'/notebooks/{current_obj_id}' + else: + error_logger.error(f'Object for Workspace ACLs is Undefined: {obj_status}') + return + api_path = '/permissions' + object_id_with_type + acl_list = object_acl.get('access_control_list', None) + access_control_list = self.build_acl_args(acl_list) + if access_control_list: + api_args = {'access_control_list': access_control_list} + resp = self.patch(api_path, api_args) + + # if skipping non-existing users, add error code to allowlist + ignore_error_list = wmconstants.IGNORE_ERROR_LIST + if self.skip_missing_users: + ignore_error_list.append("RESOURCE_DOES_NOT_EXIST") + + if logging_utils.check_error(resp, ignore_error_list): + logging_utils.log_response_error(error_logger, resp) + else: + checkpoint_key_set.write(obj_path) + return + + def import_workspace_acls(self, workspace_log_file='acl_notebooks.log', + dir_log_file='acl_directories.log', + repo_log_file='acl_repos.log', num_parallel=1): + """ + import the notebook and directory acls by looping over notebook and dir logfiles + """ + dir_acl_logs = self.get_export_dir() + dir_log_file + notebook_acl_logs = self.get_export_dir() + workspace_log_file + repo_acl_logs = self.get_export_dir() + repo_log_file + + acl_notebooks_error_logger = logging_utils.get_error_logger( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir()) + + checkpoint_notebook_acl_set = self._checkpoint_service.get_checkpoint_key_set( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT) + with open(notebook_acl_logs, encoding="utf-8") as nb_acls_fp: + with ThreadPoolExecutor(max_workers=num_parallel) as executor: + futures = [executor.submit(self.apply_acl_on_object, nb_acl_str, acl_notebooks_error_logger, 
checkpoint_notebook_acl_set) for nb_acl_str in nb_acls_fp] + concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") + propagate_exceptions(futures) + + acl_dir_error_logger = logging_utils.get_error_logger( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT, self.get_export_dir()) + checkpoint_dir_acl_set = self._checkpoint_service.get_checkpoint_key_set( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT) + + with open(dir_acl_logs, encoding='utf-8') as dir_acls_fp: + with ThreadPoolExecutor(max_workers=num_parallel) as executor: + futures = [executor.submit(self.apply_acl_on_object, dir_acl_str, acl_dir_error_logger, checkpoint_dir_acl_set) for dir_acl_str in dir_acls_fp] + concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") + propagate_exceptions(futures) + + acl_repo_error_logger = logging_utils.get_error_logger( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_REPO_ACL_OBJECT, self.get_export_dir()) + checkpoint_repo_acl_set = self._checkpoint_service.get_checkpoint_key_set( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_REPO_ACL_OBJECT) + + with open(repo_acl_logs, encoding='utf-8') as repo_acls_fp: + with ThreadPoolExecutor(max_workers=num_parallel) as executor: + futures = [ + executor.submit(self.apply_acl_on_object, repo_acl_str, acl_repo_error_logger, checkpoint_repo_acl_set) + for repo_acl_str in repo_acls_fp] + concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") + propagate_exceptions(futures) + + print("Completed import ACLs of Repos, Notebooks and Directories") + + def get_current_users(self): + """ + get the num of defined user home directories in the new workspace + if this is 0, we must create the users before importing the notebooks over. + we cannot create the users home directory since its a special type of directory + """ + ws_users = self.get(WS_LIST, {'path': '/Users/'}).get('objects', None) + if ws_users: + return len(ws_users) + else: + return 0 + + def does_user_exist(self, username): + """ + check if the users home dir exists + """ + stat = self.get(WS_STATUS, {'path': '/Users/{0}'.format(username)}) + if stat.get('object_type', None) == 'DIRECTORY': + return True + return False + + def does_path_exist(self, dir_path): + status_resp = self.get(WS_STATUS, {'path': dir_path}) + if 'error_code' in status_resp: + if status_resp.get('error_code') == 'RESOURCE_DOES_NOT_EXIST': + return False + else: + print('Failure:' + json.dumps(status_resp)) + return False + return True + + def import_current_workspace_items(self, artifact_dir='artifacts/'): + src_dir = self.get_export_dir() + artifact_dir + error_logger = logging_utils.get_error_logger(wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT, + self.get_export_dir()) + for root, subdirs, files in self.walk(src_dir): + # replace the local directory with empty string to get the notebook workspace directory + nb_dir = '/' + root.replace(src_dir, '') + upload_dir = nb_dir + if not nb_dir == '/': + upload_dir = nb_dir + '/' + if not self.does_path_exist(upload_dir): + resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir}) + if 'error_code' in resp_mkdirs: + logging_utils.log_response_error(error_logger, resp_mkdirs) + for f in files: + logging.info("Uploading: {0}".format(f)) + # create the local file path to load the DBC file + local_file_path = os.path.join(root, f) + # create the ws full file path including filename + ws_file_path = upload_dir + f + # generate json args with binary data for notebook to upload to the workspace path + nb_input_args 
= self.get_user_import_args(local_file_path, ws_file_path) + # call import to the workspace + if self.is_verbose(): + logging.info("Path: {0}".format(nb_input_args['path'])) + resp_upload = self.post(WS_IMPORT, nb_input_args) + if 'error_code' in resp_upload: + resp_upload['path'] = nb_input_args['path'] + logging_utils.log_response_error(error_logger, resp_upload) + + def import_all_workspace_items(self, artifact_dir='artifacts/', + archive_missing=False, num_parallel=4, last_session=""): + """ + import all notebooks into a new workspace. Walks the entire artifacts/ directory in parallel, and also + uploads all the files in each of the directories in parallel. + + WARNING: Because it parallelizes both on directory walking and file uploading, it can spawn as many threads as + num_parallel * num_parallel + + :param artifact_dir: notebook download directory + :param num_parallel: number of parallel threads used for directory walking and file uploads + :param archive_missing: whether to put missing users into a /Archive/ top level directory + :param last_session: a previous session against which the current session will be compared. Only the changed and new notebooks will be imported if last_session is defined. + """ + src_dir = self.get_export_dir() + artifact_dir + error_logger = logging_utils.get_error_logger(wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT, + self.get_export_dir()) + + # Given previous exported artifacts, a list of changed and newly added notebooks will be logged at notebook_changes.log + changes_since_last = set() + if last_session: + nb_changes_log = os.path.join(self.get_export_dir(), "notebook_changes.log") + base_dir = os.path.split(os.path.normpath(self.get_export_dir()))[0] + last_src_dir = os.path.join(base_dir, last_session, artifact_dir) + changes_since_last = get_updated_new_files(last_src_dir, src_dir) + log_file_changes(changes_since_last, nb_changes_log) + + checkpoint_notebook_set = self._checkpoint_service.get_checkpoint_key_set( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT) + num_exported_users = self.get_num_of_saved_users(src_dir) + num_current_users = self.get_current_users() + if num_current_users == 0: + logging.info("No registered users in existing environment. Please import users / groups first.") + raise ValueError("No registered users in the current environment") + if (num_current_users < num_exported_users) and (not archive_missing): + logging.info("Exported number of user workspaces: {0}".format(num_exported_users)) + logging.info("Current number of user workspaces: {0}".format(num_current_users)) + logging.info("Re-run with the `--archive-missing` flag to load missing users into a separate directory") + raise ValueError("Current number of users is less than number of user workspaces to import.") + archive_users = set() + + def _upload_all_files(root, subdirs, files): + ''' + Upload all files in parallel in root (current) directory.
+ ''' + # replace the local directory with empty string to get the notebook workspace directory + nb_dir = '/' + root.replace(src_dir, '') + upload_dir = nb_dir + if not nb_dir == '/': + upload_dir = nb_dir + '/' + if self.is_user_ws_item(upload_dir): + ws_user = self.get_user(upload_dir) + if archive_missing: + if ws_user in archive_users: + upload_dir = upload_dir.replace('Users', 'Archive', 1) + elif not self.does_user_exist(ws_user): + # add the user to the cache / set of missing users + logging.info("User workspace does not exist, adding to archive cache: {0}".format(ws_user)) + archive_users.add(ws_user) + # append the archive path to the upload directory + upload_dir = upload_dir.replace('Users', 'Archive', 1) + else: + logging.info("User workspace exists: {0}".format(ws_user)) + elif not self.does_user_exist(ws_user): + logging.info("User {0} is missing. " + "Please re-run with --archive-missing flag " + "or first verify all users exist in the new workspace".format(ws_user)) + return + else: + logging.info("Uploading for user: {0}".format(ws_user)) + # make the top level folder before uploading files within the loop + if not self.is_user_ws_root(upload_dir): + # if it is not the /Users/example@example.com/ root path, don't create the folder + resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir}) + if 'error_code' in resp_mkdirs: + resp_mkdirs['path'] = upload_dir + logging_utils.log_response_error(error_logger, resp_mkdirs) + + def _file_upload_helper(f): + logging.info("Uploading: {0}".format(f)) + # create the local file path to load the DBC file + local_file_path = os.path.join(root, f) + # create the ws full file path including filename + ws_file_path = upload_dir + f + if checkpoint_notebook_set.contains(ws_file_path): + return + if changes_since_last: + if local_file_path not in changes_since_last: + print(f"Skipping {f} because it has not been changed.") + return + else: + print(f"Importing {f} because it has been changed.") + # generate json args with binary data for notebook to upload to the workspace path + nb_input_args = self.get_user_import_args(local_file_path, ws_file_path) + # call import to the workspace + if self.is_verbose(): + logging.info("Path: {0}".format(nb_input_args['path'])) + resp_upload = self.post(WS_IMPORT, nb_input_args) + if 'error_code' in resp_upload: + resp_upload['path'] = ws_file_path + logging.info(f'Error uploading file: {ws_file_path}') + logging_utils.log_response_error(error_logger, resp_upload) + else: + checkpoint_notebook_set.write(ws_file_path) + + with ThreadPoolExecutor(max_workers=num_parallel) as executor: + futures = [executor.submit(_file_upload_helper, file) for file in files] + concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") + propagate_exceptions(futures) + + with ThreadPoolExecutor(max_workers=num_parallel) as executor: + futures = [executor.submit(_upload_all_files, walk[0], walk[1], walk[2]) for walk in self.walk(src_dir)] + concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") + propagate_exceptions(futures) + + def import_all_repos(self, repo_log_file="repos.log", num_parallel=1): + dir_repo_logs = self.get_export_dir() + repo_log_file + + # check to see if git creds are set up- repo import will fail if not + git_cred_api_path = "/git-credentials" + resp = self.get(git_cred_api_path) + if not resp.get("credentials", None): + logging.info("Repo import will be skipped; repos can only be imported if Git credentials are first set up.") + logging.info("To import repos separately, please run 
repo_importer.py") + return + + repo_error_logger = logging_utils.get_error_logger( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_REPO_OBJECT, self.get_export_dir()) + checkpoint_repo_set = self._checkpoint_service.get_checkpoint_key_set( + wmconstants.WM_IMPORT, wmconstants.WORKSPACE_REPO_OBJECT) + + with open(dir_repo_logs, encoding='utf-8') as repo_fp: + with ThreadPoolExecutor(max_workers=num_parallel) as executor: + futures = [ + executor.submit(self.create_repo, repo_str, repo_error_logger, + checkpoint_repo_set) + for repo_str in repo_fp] + concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") + propagate_exceptions(futures) + + def create_repo(self, repo_str, error_logger, checkpoint_repo_set): + api_path = '/repos' + repo_json = json.loads(repo_str) + repo_url = repo_json.get('url', None) + if repo_url: + logging.info("Repo: {0}".format(repo_json.get('path', ''))) + resp = self.post(api_path, repo_json) + if (resp.get('error_code') == "RESOURCE_DOES_NOT_EXIST") and \ + (resp.get('http_status_code') == 404): + parent_directory = re.sub(r"^RESOURCE_DOES_NOT_EXIST: Parent directory ", '', resp.get('message')) + parent_directory = re.sub(r" does not exist.$", '', parent_directory) + if re.fullmatch( + r'/Repos/.+[^/]', parent_directory + ): + logging.info(f"Creating parent directory {parent_directory}") + resp2 = self.post('/workspace/mkdirs', {"path": parent_directory}) + if logging_utils.check_error(resp2): + logging_utils.log_response_error(error_logger, resp2) + else: + logging.info(f"2nd attempt to create: {repo_json.get('path', '')}") + resp = self.post(api_path, repo_json) + if logging_utils.check_error(resp): + logging_utils.log_response_error(error_logger, resp) + else: + checkpoint_repo_set.write(repo_url) + else: + logging.info(f"Could not import repo {repo_json.get('path', '')}; only remote repos can be created via API.") From e2e3a434a552d8d1f67d90a7e41607a2e34e1967 Mon Sep 17 00:00:00 2001 From: mcmuffin18 Date: Fri, 21 Jun 2024 15:22:07 -0400 Subject: [PATCH 095/111] Update WorkspaceClient.py --- dbclient/WorkspaceClient.py | 74 ++++++++----------------------------- 1 file changed, 16 insertions(+), 58 deletions(-) diff --git a/dbclient/WorkspaceClient.py b/dbclient/WorkspaceClient.py index 5bbcca7a..e39fd405 100644 --- a/dbclient/WorkspaceClient.py +++ b/dbclient/WorkspaceClient.py @@ -29,10 +29,6 @@ def __init__(self, configs, checkpoint_service): self._checkpoint_service = checkpoint_service self.groups_to_keep = configs.get("groups_to_keep", False) self.skip_missing_users = configs['skip_missing_users'] - self.skip_large_nb = configs['skip_large_nb'] - self.get_user_group = False - self.users_target = [] - self.groups_target = [] _languages = {'.py': 'PYTHON', '.scala': 'SCALA', @@ -354,11 +350,8 @@ def download_notebook_helper(self, notebook_data, checkpoint_notebook_set, error logging_utils.log_response_error(error_logger, resp) return resp if resp.get('error_code', None): - if self.skip_large_nb and resp.get("message", None) == 'Size exceeds 10485760 bytes': - logging.info("Notebook {} skipped due to size exceeding limit".format(notebook_path)) - else: - resp['path'] = notebook_path - logging_utils.log_response_error(error_logger, resp) + resp['path'] = notebook_path + logging_utils.log_response_error(error_logger, resp) return resp nb_path = os.path.dirname(notebook_path) if nb_path != '/': @@ -376,7 +369,7 @@ def download_notebook_helper(self, notebook_data, checkpoint_notebook_set, error logging.warning(f"Notebook file {save_filename} already 
exists; please rename in source workspace. " f"Note that files are case-insensitive") return {} - + logging.info(save_filename) with open(save_filename, "wb") as f: f.write(base64.b64decode(resp['content'])) checkpoint_notebook_set.write(notebook_path) @@ -420,7 +413,7 @@ def log_all_workspace_items_entry(self, ws_path='/', workspace_log_file='user_wo workspace_log_writer = ThreadSafeWriter(self.get_export_dir() + workspace_log_file, "a") libs_log_writer = ThreadSafeWriter(self.get_export_dir() + libs_log_file, "a") dir_log_writer = ThreadSafeWriter(self.get_export_dir() + dir_log_file, "a") - repos_log_writer = ThreadSafeWriter(self.get_export_dir() + repos_log_file, "a") + #repos_log_writer = ThreadSafeWriter(self.get_export_dir() + repos_log_file, "a") checkpoint_item_log_set = self._checkpoint_service.get_checkpoint_key_set( wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT ) @@ -429,14 +422,14 @@ def log_all_workspace_items_entry(self, ws_path='/', workspace_log_file='user_wo workspace_log_writer=workspace_log_writer, libs_log_writer=libs_log_writer, dir_log_writer=dir_log_writer, - repos_log_writer=repos_log_writer, + repos_log_writer=None, checkpoint_set=checkpoint_item_log_set, exclude_prefixes=exclude_prefixes) finally: workspace_log_writer.close() libs_log_writer.close() dir_log_writer.close() - repos_log_writer.close() + #repos_log_writer.close() return num_nbs @@ -459,7 +452,7 @@ def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer if not os.path.exists(self.get_export_dir()): os.makedirs(self.get_export_dir(), exist_ok=True) items = self.get(WS_LIST, get_args).get('objects', None) - repos = self.get(REPOS).get('repos', None) + #repos = self.get(REPOS).get('repos', None) num_nbs = 0 if self.is_verbose(): logging.info("Listing: {0}".format(get_args['path'])) @@ -483,6 +476,7 @@ def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer if self.is_verbose(): logging.info("Skipped notebook path due to group exclusion: {0}".format(x.get('path'))) continue + if not checkpoint_set.contains(nb_path) and not nb_path.startswith(tuple(exclude_prefixes)): if self.is_verbose(): logging.info("Saving path: {0}".format(x.get('path'))) @@ -530,19 +524,20 @@ def _recurse_log_all_workspace_items(folder): if self.is_verbose(): logging.info("Skipped directory due to group exclusion: {0}".format(dir_path)) continue - + if not checkpoint_set.contains(dir_path) and not dir_path.startswith(tuple(exclude_prefixes)): num_nbs_plus = _recurse_log_all_workspace_items(folder) checkpoint_set.write(dir_path) if num_nbs_plus: num_nbs += num_nbs_plus # log all repos - if repos_log_writer and repos: - for repo in repos: - repo_path = repo.get('path', "") - if not checkpoint_set.contains(repo_path) and not repo_path.startswith(tuple(exclude_prefixes)): - repos_log_writer.write(json.dumps(repo) + '\n') - checkpoint_set.write(repo_path) + + # if repos_log_writer and repos: + # for repo in repos: + # repo_path = repo.get('path', "") + # if not checkpoint_set.contains(repo_path) and not repo_path.startswith(tuple(exclude_prefixes)): + # repos_log_writer.write(json.dumps(repo) + '\n') + # checkpoint_set.write(repo_path) return num_nbs @@ -585,15 +580,6 @@ def _acl_log_helper(json_data): futures = [executor.submit(_acl_log_helper, json_data) for json_data in read_fp] concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION") propagate_exceptions(futures) - - def get_users_groups_target(self): - users = 
self.get('/preview/scim/v2/Users?attributes=userName').get('Resources', []) - groups = self.get('/preview/scim/v2/Groups').get('Resources', []) - - users = [i['userName'] for i in users] - groups = [i['displayName'] for i in groups] - - return (users, groups) def log_all_workspace_acls(self, workspace_log_file='user_workspace.log', dir_log_file='user_dirs.log', @@ -643,26 +629,6 @@ def log_all_workspace_acls(self, workspace_log_file='user_workspace.log', end = timer() logging.info("Complete Repo ACLs Export Time: " + str(timedelta(seconds=end - start))) - def fix_acls(self, acl, groups_target, users_target): - new_acls = [] - for permission in acl: - try: - group_name = permission.get('group_name', None) - user_name = permission.get('user_name', None) - if group_name != None and group_name in groups_target: - new_acls.append(permission) - elif user_name != None and user_name in users_target: - new_acls.append(permission) - elif group_name != None and group_name not in groups_target: - logging.error(f"Group name {group_name} not found in target workspace, removing ACLs {permission}") - elif user_name != None and user_name not in users_target: - logging.error(f"User name {user_name} not found in target workspace, removing ACLs {permission}") - else: - logging.error(f"User name {user_name} or group name {group_name} has errors for ACLs {permission}") - except Exception as e: - logging.error(f"Failed at filtering permissions: {str(e)}") - return new_acls - def apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set): """ apply the acl definition to the workspace object @@ -677,13 +643,6 @@ def apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set): obj_path = object_acl['path'] logging.info(f"Working on ACL for path: {obj_path}") - if not self.get_user_group: - logging.info(f"self.get_user_group: {self.get_user_group}") - users_target, groups_target = self.get_users_groups_target() - self.users_target = users_target - self.groups_target = groups_target - self.get_user_group = True - if not checkpoint_key_set.contains(obj_path): # We cannot modify '/Shared' directory's ACL if obj_path == "/Shared" and object_type == "directory": @@ -721,7 +680,6 @@ def apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set): acl_list = object_acl.get('access_control_list', None) access_control_list = self.build_acl_args(acl_list) if access_control_list: - access_control_list = self.fix_acls(access_control_list, self.groups_target, self.users_target) api_args = {'access_control_list': access_control_list} resp = self.patch(api_path, api_args) From c05a6f87ef599590a66edcbf33a185a5a4dc470c Mon Sep 17 00:00:00 2001 From: mcmuffin18 Date: Mon, 22 Jul 2024 12:28:08 -0400 Subject: [PATCH 096/111] Search and Replace in File Have to cd (change dir) to where files are for it to work. 
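The script added by this commit walks a directory and rewrites every file in place, but it never joins the base path onto the file names it opens, which is why it has to be run from inside the target directory. A minimal sketch of the same loop that builds full paths instead, so it can be run from anywhere, could look like the following; the prompts and variable names are illustrative and not part of the committed script.

import os

base_path = input("Directory to loop through: > ")
text_to_find = input("Text to find: > ")
replacement_text = input("Text to use as replacement: > ")

for name in os.listdir(base_path):
    file_path = os.path.join(base_path, name)  # full path, so no cd into the directory is needed
    if not os.path.isfile(file_path):
        continue  # skip sub-directories and anything that is not a regular file
    with open(file_path, "r") as f:
        contents = f.read()
    print(f"Replacing occurrences of {text_to_find} with {replacement_text} in {file_path}")
    with open(file_path, "w") as f:
        f.write(contents.replace(text_to_find, replacement_text))
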
--- utils/search_and_replace.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 utils/search_and_replace.py diff --git a/utils/search_and_replace.py b/utils/search_and_replace.py new file mode 100644 index 00000000..49799214 --- /dev/null +++ b/utils/search_and_replace.py @@ -0,0 +1,23 @@ +import os + +print("Directory to Loop Through: ") +basePath = input("> ") + +print("Text to find: ") +texToFind = input("> ") + +print('Text to us as replacement: ') +replacementText = input(r'> ') + +directory = os.fsencode(basePath) + +for file in os.listdir(directory): + fileName = os.fsdecode(file) + with open(fileName, 'r') as file: + filedata = file.read() + + filedata = filedata.replace(texToFind, replacementText) + + print(f'Replacing occurences of {texToFind} with {replacementText} in file {fileName}') + with open(fileName, 'w') as file: + file.write(filedata) \ No newline at end of file From 92df31832c85ae38c4955fc57dbee7cbcc3b0c98 Mon Sep 17 00:00:00 2001 From: mcmuffin18 Date: Mon, 29 Jul 2024 14:11:02 -0400 Subject: [PATCH 097/111] Add files via upload --- utils/force_fix_schema.py | 49 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 utils/force_fix_schema.py diff --git a/utils/force_fix_schema.py b/utils/force_fix_schema.py new file mode 100644 index 00000000..670f4a95 --- /dev/null +++ b/utils/force_fix_schema.py @@ -0,0 +1,49 @@ +import pandas as pd +import numpy as np +import re +import os + +print("Directory to Loop Through: ") +basePath = input("> ") +print("Catalog to remove from DDL: ") +namespace = input("> ") + +folderName = os.path.basename(basePath) + +def fix_schema_errors(basePath: str, namespace: str): + print('\n') + print(f'Applying schema mismatch fixes to {folderName} table.') + loc_pattern = "LOCATION '.*'" + tbl_pattern = "TBLPROPERTIES.*" + # ddl_pattern = "\([^()]*\)" + + + print(f'Working on: {folderName} ...') + directory = os.fsencode(basePath) + + for file in os.listdir(directory): + fileName = os.fsdecode(file) + print(fileName) + try: + with open(fileName, "r") as f: + print(f"Opened file {fileName}") + ddl = f.read() + print(ddl) + x = re.search(loc_pattern, ddl) + + if x: + print(f"Removing {namespace} from Create Statement") + ddl = re.sub(f'{namespace}.', '', ddl) + print('Removing schema definition from ddl') + ddl = re.sub(r'\([^()]*\)', '', ddl) + if re.search(tbl_pattern, ddl): + ddl = re.sub(tbl_pattern, '', ddl) + else: + print(f'No Location in DDL in {fileName}, skipping...') + with open(fileName, 'w') as file: + file.write(ddl) + + except AttributeError: + print('Failure') + +fix_schema_errors(basePath, namespace) \ No newline at end of file From c2ca4f5da053c918c088eaf128ce36c31638aca1 Mon Sep 17 00:00:00 2001 From: mcmuffin18 Date: Tue, 30 Jul 2024 11:19:17 -0400 Subject: [PATCH 098/111] Add files via upload --- utils/force_fix_schema.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utils/force_fix_schema.py b/utils/force_fix_schema.py index 670f4a95..e49d792c 100644 --- a/utils/force_fix_schema.py +++ b/utils/force_fix_schema.py @@ -30,7 +30,10 @@ def fix_schema_errors(basePath: str, namespace: str): ddl = f.read() print(ddl) x = re.search(loc_pattern, ddl) - + print(f"Removing {namespace} from Create Statement") + ddl = re.sub(f'{namespace}.', '', ddl) + + if x: print(f"Removing {namespace} from Create Statement") ddl = re.sub(f'{namespace}.', '', ddl) From 5beb31c7b8ee1e1fd0ecf861f4a17e23177ea3ae Mon Sep 17 00:00:00 2001 From: mcmuffin18 Date: 
Thu, 1 Aug 2024 11:09:10 -0400 Subject: [PATCH 099/111] Add files via upload --- utils/force_fix_schema.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/utils/force_fix_schema.py b/utils/force_fix_schema.py index e49d792c..62402c4c 100644 --- a/utils/force_fix_schema.py +++ b/utils/force_fix_schema.py @@ -29,20 +29,21 @@ def fix_schema_errors(basePath: str, namespace: str): print(f"Opened file {fileName}") ddl = f.read() print(ddl) - x = re.search(loc_pattern, ddl) + # x = re.search(loc_pattern, ddl) print(f"Removing {namespace} from Create Statement") ddl = re.sub(f'{namespace}.', '', ddl) - - - if x: - print(f"Removing {namespace} from Create Statement") - ddl = re.sub(f'{namespace}.', '', ddl) - print('Removing schema definition from ddl') - ddl = re.sub(r'\([^()]*\)', '', ddl) - if re.search(tbl_pattern, ddl): - ddl = re.sub(tbl_pattern, '', ddl) - else: - print(f'No Location in DDL in {fileName}, skipping...') + ddl = re.sub(r'\([^()]*\)', '', ddl) + ddl = re.sub(tbl_pattern, '', ddl) + + # if x: + # print(f"Removing {namespace} from Create Statement") + # ddl = re.sub(f'{namespace}.', '', ddl) + # print('Removing schema definition from ddl') + # ddl = re.sub(r'\([^()]*\)', '', ddl) + # if re.search(tbl_pattern, ddl): + # ddl = re.sub(tbl_pattern, '', ddl) + # else: + # print(f'No Location in DDL in {fileName}, skipping...') with open(fileName, 'w') as file: file.write(ddl) From 0c74677aaa25c4e1ead8f5a1a71dba9b7b9fee7c Mon Sep 17 00:00:00 2001 From: mcmuffin18 Date: Thu, 1 Aug 2024 13:43:10 -0400 Subject: [PATCH 100/111] Add files via upload --- utils/force_fix_schema_location_check.py | 53 ++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 utils/force_fix_schema_location_check.py diff --git a/utils/force_fix_schema_location_check.py b/utils/force_fix_schema_location_check.py new file mode 100644 index 00000000..3a50cb86 --- /dev/null +++ b/utils/force_fix_schema_location_check.py @@ -0,0 +1,53 @@ +import pandas as pd +import numpy as np +import re +import os + +print("Directory to Loop Through: ") +basePath = input("> ") +print("Catalog to remove from DDL: ") +namespace = input("> ") + +folderName = os.path.basename(basePath) + +def fix_schema_errors(basePath: str, namespace: str): + print('\n') + print(f'Applying schema mismatch fixes to {folderName} table.') + loc_pattern = "LOCATION '.*'" + tbl_pattern = "TBLPROPERTIES.*" + # ddl_pattern = "\([^()]*\)" + + + print(f'Working on: {folderName} ...') + directory = os.fsencode(basePath) + + for file in os.listdir(directory): + fileName = os.fsdecode(file) + print(fileName) + try: + with open(fileName, "r") as f: + print(f"Opened file {fileName}") + ddl = f.read() + print(ddl) + x = re.search(loc_pattern, ddl) + print(f"Removing {namespace} from Create Statement") + # ddl = re.sub(f'{namespace}.', '', ddl) + # ddl = re.sub(r'\([^()]*\)', '', ddl) + # ddl = re.sub(tbl_pattern, '', ddl) + + if x: + print(f"Removing {namespace} from Create Statement") + ddl = re.sub(f'{namespace}.', '', ddl) + print('Removing schema definition from ddl') + ddl = re.sub(r'\([^()]*\)', '', ddl) + if re.search(tbl_pattern, ddl): + ddl = re.sub(tbl_pattern, '', ddl) + else: + print(f'No Location in DDL in {fileName}, skipping...') + with open(fileName, 'w') as file: + file.write(ddl) + + except AttributeError: + print('Failure') + +fix_schema_errors(basePath, namespace) \ No newline at end of file From 6a24baaaa739520e08e442dd0119719419517a7d Mon Sep 17 00:00:00 2001 From: mcmuffin18 
Date: Thu, 15 Aug 2024 11:40:59 -0400 Subject: [PATCH 101/111] Add files via upload --- utils/HMS_Modification_Get_Database.py | 110 +++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 utils/HMS_Modification_Get_Database.py diff --git a/utils/HMS_Modification_Get_Database.py b/utils/HMS_Modification_Get_Database.py new file mode 100644 index 00000000..780c1e77 --- /dev/null +++ b/utils/HMS_Modification_Get_Database.py @@ -0,0 +1,110 @@ +import os +import argparse +import json + +class MetastoreUpdater: + + def __init__(self, metastore_logs, root_bucket, mount_point, database_details_log): + self.metastore_logs = metastore_logs + self.root_bucket = root_bucket + self.database_details_log = database_details_log + if mount_point: + self.mount_point = mount_point + else: + self.mount_point = False + self.errors = {} + self.updated_ddls = {} + + def duplicate_metastore_as_backup(self): + # Get the path up one level from self.metastore_logs + backup_dir = os.path.join(os.path.dirname(self.metastore_logs), 'metastore_backup') + os.makedirs(backup_dir, exist_ok=True) + + for i in os.listdir(self.metastore_logs): + if i not in ['backup', '.DS_Store', '.ipynb_checkpoints']: + os.system(f"cp -r {os.path.join(self.metastore_logs, i)} {backup_dir}") + + def get_database_details_log(self): + # get the database details log + with open(self.database_details_log, 'r') as f: + db_details = f.read() + + # split the log by new line + db_details = db_details.split('\n') + + # get the database details + database_details = {} + for db in db_details: + try: + db = json.loads(db) + db_name = db['Namespace Name'] + db_location = db['Location'] + database_details[db_name] = db_location + except json.decoder.JSONDecodeError: + print("Error decoding JSON for database:", db) + + return database_details + + + def update_metastore(self): + db_list = [i for i in os.listdir(self.metastore_logs) if i not in ['.DS_Store', '.ipynb_checkpoints']] + + for db in db_list: + db_path = os.path.join(self.metastore_logs, db) + + table_list = [i for i in os.listdir(db_path) if i not in ['.DS_Store', '.ipynb_checkpoints']] + + for table in table_list: + table_path = os.path.join(db_path, table) + + with open(table_path, 'r') as f: + ddl = f.read() + + if "location '" in ddl.lower(): + self.errors[db + table] = "location found in ddl" + ddl + continue + + if "create view" in ddl.lower(): + self.errors[db + table] = "create view found in ddl" + ddl + continue + + if db != 'default': + db_details_dict = self.get_database_details_log() + if db in db_details_dict: + location = db_details_dict[db] + "/" + table + else: + print(f"ERROR: Database {db} not found in database details log") + + new_ddl = ddl + "\nLOCATION '" + location + "'" + + with open(table_path, 'w') as f: + f.write(new_ddl) + + self.updated_ddls[db + table] = new_ddl + + def analyze_performance(self): + # Print the number of tables updated + print(f"Number of tables updated: {len(self.updated_ddls)}") + # Print the number of errors + print(f"Number of errors: {len(self.errors)}") + # Print the errors with create view found in ddl + print("Number of view errors: ", len([i for i in self.errors.values() if "create view found in ddl" in i])) + # Print the errors with location found in ddl + print("Number of location errors: ", len([i for i in self.errors.values() if "location found in ddl" in i])) + + +def parser(): + parser = argparse.ArgumentParser(description='Update metastore logs') + parser.add_argument('--metastore_logs', type=str, help='Path to 
metastore logs', required=True) + parser.add_argument('--root_bucket', type=str, help='Root bucket name', required=False) + parser.add_argument('--mount_point', type=str, help='Mount point', required=False) + parser.add_argument('--database_details_log', type=str, help='Database details log', required=False) + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = parser() + updater = MetastoreUpdater(args.metastore_logs, args.root_bucket, args.mount_point, args.database_details_log) + updater.duplicate_metastore_as_backup() + updater.update_metastore() + updater.analyze_performance() \ No newline at end of file From 984cfe029b4664a0d979c8b6f8a96ee0ebcf142b Mon Sep 17 00:00:00 2001 From: mcmuffin18 Date: Thu, 15 Aug 2024 12:18:05 -0400 Subject: [PATCH 102/111] Add files via upload --- utils/HMS_Modification_Get_Database.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/HMS_Modification_Get_Database.py b/utils/HMS_Modification_Get_Database.py index 780c1e77..59247589 100644 --- a/utils/HMS_Modification_Get_Database.py +++ b/utils/HMS_Modification_Get_Database.py @@ -42,6 +42,7 @@ def get_database_details_log(self): database_details[db_name] = db_location except json.decoder.JSONDecodeError: print("Error decoding JSON for database:", db) + continue return database_details From 5ba7c51b5651870951c7313f65eacf076ca01376 Mon Sep 17 00:00:00 2001 From: mcmuffin18 Date: Tue, 20 Aug 2024 08:27:00 -0400 Subject: [PATCH 103/111] Add files via upload --- utils/ff_view_tblprop.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 utils/ff_view_tblprop.py diff --git a/utils/ff_view_tblprop.py b/utils/ff_view_tblprop.py new file mode 100644 index 00000000..e53bd08a --- /dev/null +++ b/utils/ff_view_tblprop.py @@ -0,0 +1,36 @@ +import pandas as pd +import numpy as np +import re +import os + +print("Directory to Loop Through: ") +basePath = input("> ") + +folderName = os.path.basename(basePath) + +def fix_schema_errors(basePath: str): + + tbl_pattern = r"TBLPROPERTIES \([^()]*\)" + # ddl_pattern = "\([^()]*\)" + + + print(f'Working on: {folderName} ...') + directory = os.fsencode(basePath) + + for file in os.listdir(directory): + fileName = os.fsdecode(file) + print(fileName) + try: + with open(fileName, "r") as f: + print(f"Opened file {fileName}") + ddl = f.read() + print(ddl) + if re.search(tbl_pattern, ddl): + ddl = re.sub(tbl_pattern, '', ddl) + with open(fileName, 'w') as file: + file.write(ddl) + + except AttributeError: + print('Failure') + +fix_schema_errors(basePath) \ No newline at end of file From 0484a694a3d5fd9a70986e73d175e01a37b98bd7 Mon Sep 17 00:00:00 2001 From: cbartholomew2 <89409387+cbartholomew2@users.noreply.github.com> Date: Tue, 27 Aug 2024 13:11:17 -0400 Subject: [PATCH 104/111] Add files via upload --- utils/create_sample_jobs_new.py | 87 +++++++++++ utils/databricks_delete.py | 266 ++++++++++++++++++++++++++++++++ utils/jobs_dbr_modification.py | 76 +++++++++ 3 files changed, 429 insertions(+) create mode 100644 utils/create_sample_jobs_new.py create mode 100644 utils/databricks_delete.py create mode 100644 utils/jobs_dbr_modification.py diff --git a/utils/create_sample_jobs_new.py b/utils/create_sample_jobs_new.py new file mode 100644 index 00000000..03740725 --- /dev/null +++ b/utils/create_sample_jobs_new.py @@ -0,0 +1,87 @@ +import json +import pandas as pd +import csv +import os +import datetime +import argparse + +def read_log(file_name): + try: + with open("./all_jobs/"+file_name) as f: + data = 
f.read().split("\n") + return data[:-1] + except FileNotFoundError as e: + return '' + except Exception as e: + print("Error while reading file:", file_name, "\n", e) + return '' + +def move_logs(timestamp=""): + # moving all_jobs + os.rename("jobs.log", f"./all_jobs/jobs{timestamp}.log") + os.rename("acl_jobs.log", f"./all_jobs/acl_jobs{timestamp}.log") + +def write_job_log(data, sample_job_ids): + with open("jobs.log", "w") as jl: + for d in data: + try: + d = json.loads(d) + if d['job_id'] in sample_job_ids: + jl.write(json.dumps(d) + "\n") + except: + print("Error while writing jobs.log") + + +def write_job_acls_log(data, sample_job_ids): + with open("acl_jobs.log", "w") as jal: + for d in data: + try: + d = json.loads(d) + if int(d['object_id'].split("/")[-1]) in sample_job_ids: + jal.write(json.dumps(d) + "\n") + except: + print("Error while writing acl_jobs.log") + +def write_rest_job_logs(jobslog, acljobslog, sample_job_ids): + with open("other_jobs.log", "w") as ojl: + for d in jobslog: + try: + d = json.loads(d) + if d['job_id'] not in sample_job_ids: + ojl.write(json.dumps(d) + "\n") + except: + print("Error while writing other_jobs.log") + + with open("other_acl_jobs.log", "w") as ojal: + for d in acljobslog: + try: + d = json.loads(d) + if int(d['object_id'].split("/")[-1]) not in sample_job_ids: + ojal.write(json.dumps(d) + "\n") + except: + print("Error while writing other_acl_jobs.log") + +def main(): + + job_ids = [410104035299, 30596903773550, 97211745563636] + + if "all_jobs" not in os.listdir(): + os.mkdir("./all_jobs/") + move_logs() + elif "jobs.log" in os.listdir(): + ts = datetime.datetime.now() + move_logs("_"+str(ts)) + + #json objects + job_log_data = read_log("jobs.log") + job_acl_log_data = read_log("acl_jobs.log") + + #move jobs.log into ./alljobs folder + write sample jobs log in main logs folder + write_job_log(job_log_data, job_ids) + write_job_acls_log(job_acl_log_data, job_ids) + + #write jobs.log that only contains jobs NOT in sample jobs log + write_rest_job_logs(job_log_data, job_acl_log_data, job_ids) + +if __name__ == "__main__": + main() diff --git a/utils/databricks_delete.py b/utils/databricks_delete.py new file mode 100644 index 00000000..7453fb16 --- /dev/null +++ b/utils/databricks_delete.py @@ -0,0 +1,266 @@ +import argparse +import requests +import json +import sys +import time +import os +import datetime +import configparser +import re + + +class Databricks(object): + + def __init__(self, **kwargs): + profile = kwargs['profile'] if 'profile' in kwargs else 'DEFAULT' + login = self.get_login_credentials(profile) + url = login['host'] + token = login['token'] + self.host = self.url_validation(url) + self.token = token + print(f"Running on {self.host}") + self.check_file = kwargs['check_file'] if 'check_file' in kwargs else None + self.session = kwargs['session'] if 'session' in kwargs else None + self.retry_backoff = kwargs['retry_backoff'] if 'retry_backoff' in kwargs else 0.1 + + def progress(self, _cur, _max): + p = round(100*_cur/_max) + b = f"Progress: {_cur}/{_max}" + print(b, end="\r") + + def collect_jobs(self): + host = self.host + token = self.token + jobs_list = requests.get("{db_url}/api/2.0/jobs/list".format(db_url=host), headers={ + "Authorization": "Bearer {bearer_token}".format(bearer_token=token), + "Content-Type": "application/json"}) + log_file = "./logs/delete_jobs.log" + logger = open(log_file, 'w+') + logger.write("NEW RUN LOGGED: " + str(datetime.datetime.now()) + "\n") + logger.write("..." 
* 5 + "\n") + jobs = jobs_list.json()['jobs'] + job_ids = [] + job_ids = [{'job_id': job['job_id'], 'created_time': job['created_time']} for job in jobs] + job_ids = sorted(job_ids, key=lambda i: i['job_id']) + job_names_e2 = [job['settings']['name'] for job in jobs] + print("Total jobs: " + str(len(job_ids))) + logger.write("Total jobs: " + str(len(job_ids)) + "\n") + print("..." * 5, end="\r") + job_names = [] + if self.check_file: + with open(self.check_file) as f: + check_file = f.readlines() + + check_file = [x.split(',')[1] for x in check_file] + check_file = [x.strip() for x in check_file] + print("Total jobs to check: " + str(len(check_file))) + print("..." * 5, end="\r") + for job in jobs: + if job['settings']['name'] in check_file: + job_names.append(job['settings']['name']) + + skipped_jobs = [job for job in check_file if job not in job_names_e2] + print("Skipped jobs: " + str(len(skipped_jobs))) + job_ids = [{'job_id': job['job_id'], 'created_time': job['created_time']} for job in jobs if job['settings']['name'] in job_names] + logger.write("Total jobs to check: " + str(len(check_file)) + "\n") + logger.write("..." * 5 + "\n") + logger.write("Total jobs to delete: " + str(len(job_ids)) + "\n") + logger.write("..." * 5 + "\n") + logger.write("Not deleted jobs in E2: \n") + logger.write(','.join([json.dumps({'job_id': job['job_id'], 'job_name': job['settings']['name'], 'created_time': job['created_time']}) for job in jobs if job['settings']['name'] not in job_names])) + logger.write("\n") + logger.write("Deleted jobs in E2: \n") + logger.write(','.join([json.dumps({'job_id': job['job_id'], 'job_name': job['settings']['name'], 'created_time': job['created_time']}) for job in jobs if job['settings']['name'] in job_names])) + logger.write("\n") + logger.write("Check jobs not found in E2: \n") + logger.write(','.join(skipped_jobs)) + logger.close() + + print("Total jobs to delete: " + str(len(job_ids))) + print("List of job names to delete: " + str(job_names)) + user_response = input("Do you want to continue (y/n): ") + if str(user_response).lower() != 'y': + sys.exit(1) + return job_ids + + def collect_clusters(self): + host = self.host + token = self.token + clusters_list = requests.get("{db_url}/api/2.0/clusters/list".format(db_url=host), headers={ + "Authorization": "Bearer {bearer_token}".format(bearer_token=token), + "Content-Type": "application/json"}) + clusters = clusters_list.json()['clusters'] + cluster_ids = [{'cluster_id': cluster['cluster_id'], 'state': cluster['state']} for cluster in clusters] + cluster_ids = sorted(cluster_ids, key=lambda i: i['cluster_id']) + print("Total clusters: " + str(len(cluster_ids))) + print("..." * 5, end="\r") + return cluster_ids + + def delete_clusters(self): + host = self.host + token = self.token + + cluster_ids = self.collect_clusters() + output_file = f"./logs/{self.session}/delete_clusters.log" + fd = open(output_file, 'a+') + print("*" * 80, file=fd) + print("NEW RUN LOGGED: " + str(datetime.datetime.now()), file=fd) + print("cluster_id,status", file=fd) + cluster_num = 0 + cluster_max = len(cluster_ids) + for cluster_id in cluster_ids: + if cluster_id['state'] == 'RUNNING': + print("Cluster " + str(cluster_id['cluster_id']) + " is running. 
So not deleting this cluster") + self.progress(cluster_num, cluster_max) + cluster_num += 1 + continue + data = { + "cluster_id": "{cluster_id}".format(cluster_id=cluster_id['cluster_id']) + } + result = requests.post("{db_url}/api/2.0/clusters/delete".format(db_url=host), headers={"Authorization": "Bearer {bearer_token}".format(bearer_token=token), "Content-Type": "application/json"}, json=data) + print("{cluster_id},{status}".format(cluster_id=cluster_id, status=result.status_code), file=fd) + self.progress(cluster_num, cluster_max) + cluster_num += 1 + print("..." * 5, end="\r") + print("Done") + fd.close() + + def progress_bar(self, current, total, starttime, currenttime, barLength = 20): + percent = (current / total) * 100 + arrow = '-' * int(percent / 100 * barLength - 1) + '>' + spaces = ' ' * (barLength - len(arrow)) + # want to do two decimal points + time_elapsed = currenttime - starttime + time_remaining = (time_elapsed / (current + 1)) * (total - (current + 1)) + time_remaining_fmt = str(datetime.timedelta(seconds=time_remaining)) + print(f'Progress: [{arrow + spaces}] {percent:.2f}% Estimated time remaining: {time_remaining_fmt}', end='\r') + + def delete_jobs(self): + host = self.host + token = self.token + + job_ids = self.collect_jobs() + output_file = f"./logs/{self.session}/delete_jobs.log" + fd = open(output_file, 'a+') + print("*" * 80, file=fd) + print("NEW RUN LOGGED: " + str(datetime.datetime.now()), file=fd) + print("job_id,status", file=fd) + job_num = 0 + job_max = len(job_ids) + for job_id in job_ids: + job_runs = requests.get("{db_url}/api/2.0/jobs/runs/list?job_id={jobid}&active_only=true".format(db_url=host, jobid=job_id['job_id']), headers={"Authorization": "Bearer {bearer_token}".format(bearer_token=token), "Content-Type": "application/json"}) + if job_runs.status_code == 200 and "runs" in job_runs.json(): + print("Job " + str(job_id['job_id']) + " is active. So not deleting this job") + self.progress(job_num, job_max) + job_num += 1 + continue + data = { + "job_id": "{job_id}".format(job_id=job_id['job_id']) + } + result = requests.post("{db_url}/api/2.0/jobs/delete".format(db_url=host), headers={"Authorization": "Bearer {bearer_token}".format(bearer_token=token), "Content-Type": "application/json"}, json=data) + print("{job_id},{status}".format(job_id=job_id, status=result.status_code), file=fd) + self.progress(job_num, job_max) + job_num += 1 + print("..." 
* 5, end="\r") + print("Done") + fd.close() + + def read_log_file(self, log_file): + with open(log_file, 'r') as f: + return f.readlines() + + def delete_workspace_obj(self, path): + url = self.host + token = self.token + api_url = f"{url}/api/2.0/workspace/delete" + fd = open(f"./logs/{self.session}/delete_notebooks.log", 'a+') + print("Deleting: " + path, file=fd) + payload = {'path': path} + headers = { + 'Authorization': f'Bearer {token}', + 'Content-Type': 'application/json' + } + response = requests.post(api_url, headers=headers, json=payload) + print(response.text, file=fd) + fd.close() + return response + + def delete_notebooks(self): + host = self.host + token = self.token + fd = open(f"./logs/{self.session}/delete_notebooks.log", 'a+') + print("*" * 80, file=fd) + print("NEW RUN LOGGED: " + str(datetime.datetime.now()), file=fd) + fd.close() + notebooks_list = self.read_log_file(f"./logs/{self.session}/user_workspace.log") + print("Total notebooks: " + str(len(notebooks_list))) + total = len(notebooks_list) + starting_Time = time.time() + for i, notebook in enumerate(notebooks_list): + time.sleep(self.retry_backoff) + current_time = time.time() + self.progress_bar(i, total, starting_Time, current_time) + response = self.delete_workspace_obj(json.loads(notebook).get("path")) + + def get_url_token(self): + return self.url, self.token + + def url_validation(self, url): + if '/?o=' in url: + # if the workspace_id exists, lets remove it from the URL + url = re.sub("/?o=.*", '', url) + elif 'net/' == url[-4:]: + url = url[:-1] + elif 'com/' == url[-4:]: + url = url[:-1] + return url.rstrip("/") + + def get_login_credentials(self, profile='DEFAULT'): + creds_path = '~/.databrickscfg' + config = configparser.ConfigParser() + abs_creds_path = os.path.expanduser(creds_path) + config.read(abs_creds_path) + try: + current_profile = dict(config[profile]) + if not current_profile: + raise ValueError(f"Unable to find a defined profile to run this tool. Profile '{profile}' not found.") + return current_profile + except KeyError: + raise ValueError( + 'Unable to find credentials to load for profile. Profile only supports tokens.') + + +class InputHandler(object): + def __init__(self): + pass + + def get(self): + parser = argparse.ArgumentParser(description='Delete databricks Jobs') + parser.add_argument('-p', '--profile', dest='profile', required=True, help="Databricks Server URL") + parser.add_argument('-c', '--check-file', dest='check_file', required=False, help="Check for job name in file") + parser.add_argument('-s', '--session', dest='session', required=False, help="Session name") + parser.add_argument('-t', '--task', dest='task', required=False, help="Task to perform. 
One of 'delete_jobs', 'delete_notebooks', 'delete_clusters'", default='delete_jobs') + parser.add_argument('--retry-backoff', dest='retry_backoff', required=False, help="Retry backoff time", default=1.0) + + parse_input = parser.parse_args() + + if not parse_input.check_file and parse_input.task == 'delete_jobs': + print("Check file not provided or not found") + user_response = input("Do you want to continue without check file (y/n): ") + if user_response.lower() != 'y': + parser.print_help() + sys.exit(1) + + return parse_input + + +if __name__ == '__main__': + input_handler = InputHandler() + parse_input = input_handler.get() + dbObj = Databricks(profile=parse_input.profile, check_file=parse_input.check_file, session=parse_input.session, retry_backoff=parse_input.retry_backoff) + if parse_input.task == 'delete_jobs': + dbObj.delete_jobs() + elif parse_input.task == 'delete_notebooks': + dbObj.delete_notebooks() + elif parse_input.task == 'delete_clusters': + dbObj.delete_clusters() \ No newline at end of file diff --git a/utils/jobs_dbr_modification.py b/utils/jobs_dbr_modification.py new file mode 100644 index 00000000..7bd1d880 --- /dev/null +++ b/utils/jobs_dbr_modification.py @@ -0,0 +1,76 @@ +import json +import sys + +def modify_json_file(input_file, output_file, new_dbr_job_ids, new_spark_version, default_spark_version): + try: + with open(input_file, 'r') as infile, open(output_file, 'w') as outfile: + # Read each line from the input file + for line in infile: + try: + # Parse the JSON string into a dictionary + data = json.loads(line) + + # Modify the spark_version in the new_cluster + # if "settings" in data and "new_cluster" in data["settings"]: + # data["settings"]["new_cluster"]["spark_version"] = new_spark_version + + job_id = data.get("job_id") + + + if job_id in new_dbr_job_ids: + + spark_version_to_use = new_spark_version + + else: + + spark_version_to_use = default_spark_version + + + if "job_clusters" in data['settings']: + for i, job_cluster in enumerate(data['settings']["job_clusters"]): + + + data['settings']["job_clusters"][i]['new_cluster']['spark_version'] = spark_version_to_use + + + if "tasks" in data["settings"].keys(): + # Multi-task + for i, task in enumerate(data["settings"]["tasks"]): + + + if "new_cluster" in task: + + data["settings"]["tasks"][i]["new_cluster"]['spark_version'] = spark_version_to_use + + + else: + # Single-task + + if "new_cluster" in data['settings'].keys(): + + data["settings"]["new_cluster"]['spark_version'] = spark_version_to_use + + + # Convert the modified dictionary back into a JSON string + modified_json_line = json.dumps(data) + + # Write the modified JSON to the output file + outfile.write(modified_json_line + '\n') + + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}", file=sys.stderr) + + except IOError as e: + print(f"Error opening or writing to file: {e}", file=sys.stderr) + +if __name__ == "__main__": + # Replace 'input.json' and 'output.json' with your actual file paths + input_file = './jobs_logs_testing/LL_jobs.log' + output_file = './jobs_logs_testing/LL_updated_jobs.log' + + new_dbr_job_ids = [1009, 863] + + # Modify the JSON file with the new spark version + modify_json_file(input_file, output_file, new_dbr_job_ids, new_spark_version="15.4.x-scala2.12", default_spark_version= "14.3.x-scala2.12") + + print(f"Modified JSON written to {output_file}") From d6c8591b871bbac75d2f6132220791b60f55e21a Mon Sep 17 00:00:00 2001 From: cbartholomew2 <89409387+cbartholomew2@users.noreply.github.com> Date: Wed, 
28 Aug 2024 14:51:23 -0400 Subject: [PATCH 105/111] Update default_jobs_cluster_aws.json --- data/default_jobs_cluster_aws.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/default_jobs_cluster_aws.json b/data/default_jobs_cluster_aws.json index e6ab94b6..c1d58b05 100644 --- a/data/default_jobs_cluster_aws.json +++ b/data/default_jobs_cluster_aws.json @@ -1,6 +1,6 @@ { "num_workers": 8, - "spark_version": "7.3.x-scala2.12", + "spark_version": "14.3.x-scala2.12", "node_type_id": "i3.xlarge", "spark_env_vars": { "PYSPARK_PYTHON": "/databricks/python3/bin/python3" From 5e04efc155002d9361c0c01bc740e55e5dcd1818 Mon Sep 17 00:00:00 2001 From: cbartholomew2 <89409387+cbartholomew2@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:51:43 -0400 Subject: [PATCH 106/111] Update default_jobs_cluster_aws_hipaa.json --- data/default_jobs_cluster_aws_hipaa.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/default_jobs_cluster_aws_hipaa.json b/data/default_jobs_cluster_aws_hipaa.json index da0f2e65..42e12cf5 100644 --- a/data/default_jobs_cluster_aws_hipaa.json +++ b/data/default_jobs_cluster_aws_hipaa.json @@ -1,6 +1,6 @@ { "num_workers": 8, - "spark_version": "7.3.x-scala2.12", + "spark_version": "14.3.x-scala2.12", "node_type_id": "i4i.xlarge", "spark_env_vars": { "PYSPARK_PYTHON": "/databricks/python3/bin/python3" From a1d85d60f6fc0901abd68ff5637d11879c71076f Mon Sep 17 00:00:00 2001 From: cbartholomew2 <89409387+cbartholomew2@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:54:48 -0400 Subject: [PATCH 107/111] Update nitro_mapping.csv --- data/nitro_mapping.csv | 152 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 1 deletion(-) diff --git a/data/nitro_mapping.csv b/data/nitro_mapping.csv index 7015847c..74bde2ed 100644 --- a/data/nitro_mapping.csv +++ b/data/nitro_mapping.csv @@ -148,4 +148,154 @@ z1d.xlarge,r5n.xlarge,FALSE z1d.2xlarge,r5n.2xlarge,FALSE z1d.3xlarge,r5n.4xlarge,FALSE z1d.6xlarge,r5n.8xlarge,FALSE -z1d.12xlarge,r5n.12xlarge,FALSE \ No newline at end of file +z1d.12xlarge,r5n.12xlarge,FALSE +m5n.large,m5n.large,FALSE +m5n.xlarge,m5n.xlarge,FALSE +m5n.2xlarge,m5n.2xlarge,FALSE +m5n.4xlarge,m5n.4xlarge,FALSE +m5n.12xlarge,m5n.12xlarge,FALSE +m5n.16xlarge,m5n.16xlarge,FALSE +m5n.large,m5n.large,FALSE +m5n.xlarge,m5n.xlarge,FALSE +m5n.2xlarge,m5n.2xlarge,FALSE +m5n.4xlarge,m5n.4xlarge,FALSE +m5n.8xlarge,m5n.8xlarge,FALSE +m5n.12xlarge,m5n.12xlarge,FALSE +m5n.16xlarge,m5n.16xlarge,FALSE +m5n.24xlarge,m5n.24xlarge,FALSE +m5dn.large,m5dn.large,FALSE +m5dn.xlarge,m5dn.xlarge,FALSE +m5dn.2xlarge,m5dn.2xlarge,FALSE +m5dn.4xlarge,m5dn.4xlarge,FALSE +m5dn.8xlarge,m5dn.8xlarge,FALSE +m5dn.12xlarge,m5dn.12xlarge,FALSE +m5dn.16xlarge,m5dn.16xlarge,FALSE +m5dn.24xlarge,m5dn.24xlarge,FALSE +m5n.large,m5n.large,FALSE +m5n.xlarge,m5n.xlarge,FALSE +m5n.2xlarge,m5n.2xlarge,FALSE +m5n.4xlarge,m5n.4xlarge,FALSE +m5n.8xlarge,m5n.8xlarge,FALSE +m5n.12xlarge,m5n.12xlarge,FALSE +m5n.16xlarge,m5n.16xlarge,FALSE +m5n.24xlarge,m5n.24xlarge,FALSE +m5n.large,m5n.large,FALSE +m5n.xlarge,m5n.xlarge,FALSE +m5n.2xlarge,m5n.2xlarge,FALSE +m5n.4xlarge,m5n.4xlarge,FALSE +m5n.8xlarge,m5n.8xlarge,FALSE +m5n.12xlarge,m5n.12xlarge,FALSE +m5n.16xlarge,m5n.16xlarge,FALSE +m5dn.large,m5dn.large,FALSE +m5dn.xlarge,m5dn.xlarge,FALSE +m5dn.2xlarge,m5dn.2xlarge,FALSE +m5dn.4xlarge,m5dn.4xlarge,FALSE +m5dn.8xlarge,m5dn.8xlarge,FALSE +m5dn.12xlarge,m5dn.12xlarge,FALSE +m5dn.16xlarge,m5dn.16xlarge,FALSE +c5a.2xlarge,c5a.2xlarge,FALSE 
+c5a.4xlarge,c5a.4xlarge,FALSE +c5a.8xlarge,c5a.8xlarge,FALSE +c5a.xlarge,c5a.xlarge,FALSE +c5a.2xlarge,c5a.2xlarge,FALSE +c5a.4xlarge,c5a.4xlarge,FALSE +c5a.8xlarge,c5a.8xlarge,FALSE +c5a.12xlarge,c5a.12xlarge,FALSE +c5a.16xlarge,c5a.16xlarge,FALSE +c5a.24xlarge,c5a.24xlarge,FALSE +c5ad.xlarge,c5ad.xlarge,FALSE +c5ad.2xlarge,c5ad.2xlarge,FALSE +c5ad.4xlarge,c5ad.4xlarge,FALSE +c5ad.8xlarge,c5ad.8xlarge,FALSE +c5ad.12xlarge,c5ad.12xlarge,FALSE +c5ad.16xlarge,c5ad.16xlarge,FALSE +c5ad.24xlarge,c5ad.24xlarge,FALSE +c5a.xlarge,c5a.xlarge,FALSE +c5a.2xlarge,c5a.2xlarge,FALSE +c5a.4xlarge,c5a.4xlarge,FALSE +c5a.8xlarge,c5a.8xlarge,FALSE +c5a.12xlarge,c5a.12xlarge,FALSE +c5a.16xlarge,c5a.16xlarge,FALSE +c5ad.xlarge,c5ad.xlarge,FALSE +c5ad.2xlarge,c5ad.2xlarge,FALSE +c5ad.4xlarge,c5ad.4xlarge,FALSE +c5ad.8xlarge,c5ad.8xlarge,FALSE +c5ad.12xlarge,c5ad.12xlarge,FALSE +c5ad.16xlarge,c5ad.16xlarge,FALSE +r5n.xlarge,r5n.xlarge,FALSE +r5n.2xlarge,r5n.2xlarge,FALSE +r5n.4xlarge,r5n.4xlarge,FALSE +r5n.8xlarge,r5n.8xlarge,FALSE +r5n.xlarge,r5n.xlarge,FALSE +r5n.2xlarge,r5n.2xlarge,FALSE +r5n.4xlarge,r5n.4xlarge,FALSE +r5n.8xlarge,r5n.8xlarge,FALSE +r5n.16xlarge,r5n.16xlarge,FALSE +r5n.large,r5n.large,FALSE +r5n.xlarge,r5n.xlarge,FALSE +r5n.2xlarge,r5n.2xlarge,FALSE +r5n.4xlarge,r5n.4xlarge,FALSE +r5n.8xlarge,r5n.8xlarge,FALSE +r5n.12xlarge,r5n.12xlarge,FALSE +r5n.16xlarge,r5n.16xlarge,FALSE +r5n.24xlarge,r5n.24xlarge,FALSE +r5dn.large,r5dn.large,FALSE +r5dn.xlarge,r5dn.xlarge,FALSE +r5dn.2xlarge,r5dn.2xlarge,FALSE +r5dn.4xlarge,r5dn.4xlarge,FALSE +r5dn.8xlarge,r5dn.8xlarge,FALSE +r5dn.12xlarge,r5dn.12xlarge,FALSE +r5dn.16xlarge,r5dn.16xlarge,FALSE +r5dn.24xlarge,r5dn.24xlarge,FALSE +r5n.large,r5n.large,FALSE +r5n.xlarge,r5n.xlarge,FALSE +r5n.2xlarge,r5n.2xlarge,FALSE +r5n.4xlarge,r5n.4xlarge,FALSE +r5n.8xlarge,r5n.8xlarge,FALSE +r5n.12xlarge,r5n.12xlarge,FALSE +r5n.16xlarge,r5n.16xlarge,FALSE +r5n.24xlarge,r5n.24xlarge,FALSE +r5n.large,r5n.large,FALSE +r5n.xlarge,r5n.xlarge,FALSE +r5n.2xlarge,r5n.2xlarge,FALSE +r5n.4xlarge,r5n.4xlarge,FALSE +r5n.8xlarge,r5n.8xlarge,FALSE +r5n.12xlarge,r5n.12xlarge,FALSE +r5n.16xlarge,r5n.16xlarge,FALSE +r5dn.large,r5dn.large,FALSE +r5dn.xlarge,r5dn.xlarge,FALSE +r5dn.2xlarge,r5dn.2xlarge,FALSE +r5dn.4xlarge,r5dn.4xlarge,FALSE +r5dn.8xlarge,r5dn.8xlarge,FALSE +r5dn.12xlarge,r5dn.12xlarge,FALSE +r5dn.16xlarge,r5dn.16xlarge,FALSE +i4i.large,i4i.large,FALSE +i4i.xlarge,i4i.xlarge,FALSE +i4i.2xlarge,i4i.2xlarge,FALSE +i4i.4xlarge,i4i.4xlarge,FALSE +i4i.8xlarge,i4i.8xlarge,FALSE +i4i.16xlarge,i4i.16xlarge,FALSE +i3en.xlarge,i3en.xlarge,FALSE +i3en.2xlarge,i3en.2xlarge,FALSE +i3en.3xlarge,i3en.3xlarge,FALSE +i3en.6xlarge,i3en.6xlarge,FALSE +g4ad.4xlargeGPU,g4ad.4xlargeGPU,FALSE +g4ad.8xlargeGPU,g4ad.8xlargeGPU,FALSE +g4ad.16xlargeGPU,g4ad.16xlargeGPU,FALSE +g4ad.4xlargeGPU,g4ad.4xlargeGPU,FALSE +g4ad.8xlargeGPU,g4ad.8xlargeGPU,FALSE +g4ad.16xlargeGPU,g4ad.16xlargeGPU,FALSE +g4dn.xlargeGPU,g4dn.xlargeGPU,FALSE +g4dn.2xlargeGPU,g4dn.2xlargeGPU,FALSE +g4dn.4xlargeGPU,g4dn.4xlargeGPU,FALSE +g4dn.8xlargeGPU,g4dn.8xlargeGPU,FALSE +g4dn.12xlargeGPU,g4dn.12xlargeGPU,FALSE +g4dn.16xlargeGPU,g4dn.16xlargeGPU,FALSE +p3dn.24xlargeGPU,p3dn.24xlargeGPU,FALSE +p3dn.24xlargeGPU,p3dn.24xlargeGPU,FALSE +r5n.large,r5n.large,FALSE +r5n.xlarge,r5n.xlarge,FALSE +r5n.2xlarge,r5n.2xlarge,FALSE +r5n.4xlarge,r5n.4xlarge,FALSE +r5n.8xlarge,r5n.8xlarge,FALSE +r5n.12xlarge,r5n.12xlarge,FALSE From 258f4ca3a042643ed582cdf46e8bbd18bbab2d9d Mon Sep 17 00:00:00 2001 From: mcmuffin18 Date: Tue, 3 Sep 2024 
12:29:09 -0400 Subject: [PATCH 108/111] Update HMS_Modification_Get_Database.py --- utils/HMS_Modification_Get_Database.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/HMS_Modification_Get_Database.py b/utils/HMS_Modification_Get_Database.py index 59247589..a9258e54 100644 --- a/utils/HMS_Modification_Get_Database.py +++ b/utils/HMS_Modification_Get_Database.py @@ -75,6 +75,7 @@ def update_metastore(self): location = db_details_dict[db] + "/" + table else: print(f"ERROR: Database {db} not found in database details log") + continue new_ddl = ddl + "\nLOCATION '" + location + "'" @@ -108,4 +109,4 @@ def parser(): updater = MetastoreUpdater(args.metastore_logs, args.root_bucket, args.mount_point, args.database_details_log) updater.duplicate_metastore_as_backup() updater.update_metastore() - updater.analyze_performance() \ No newline at end of file + updater.analyze_performance() From 1b73e7bef8beaf3fbf4e48e887571b90a34f6d56 Mon Sep 17 00:00:00 2001 From: James Parham Date: Thu, 26 Jun 2025 11:56:08 -0400 Subject: [PATCH 109/111] added use-logs flag, pagination, and concurrent futures --- .gitignore | 1 + WorkspaceClient_modified.py | 6 +- dbclient/ClustersClient.py | 8 +- dbclient/ScimClient.py | 141 +++++++++++++++++++++++++++++++----- dbclient/WorkspaceClient.py | 3 +- dbclient/dbclient.py | 1 + dbclient/parser.py | 11 ++- 7 files changed, 146 insertions(+), 25 deletions(-) diff --git a/.gitignore b/.gitignore index d206ab0e..72a73cf4 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ dist/ .tox/ databricks_migration_tool.egg-info migrate.iml +export_dir/ diff --git a/WorkspaceClient_modified.py b/WorkspaceClient_modified.py index e39fd405..4666422d 100644 --- a/WorkspaceClient_modified.py +++ b/WorkspaceClient_modified.py @@ -14,6 +14,7 @@ import logging import os from dbclient.common.WorkspaceDiff import * +from dbclient.ScimClient import ScimClient WS_LIST = "/workspace/list" WS_STATUS = "/workspace/get-status" @@ -26,6 +27,7 @@ class WorkspaceClient(dbclient): def __init__(self, configs, checkpoint_service): super().__init__(configs) + self.scim_client = ScimClient(configs, checkpoint_service) self._checkpoint_service = checkpoint_service self.groups_to_keep = configs.get("groups_to_keep", False) self.skip_missing_users = configs['skip_missing_users'] @@ -463,7 +465,9 @@ def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer notebooks = self.filter_workspace_items(items, 'NOTEBOOK') libraries = self.filter_workspace_items(items, 'LIBRARY') # only get user list if we are filtering by group - ws_users = self.get('/preview/scim/v2/Users').get('Resources', None) if self.groups_to_keep else [] + # ws_users = self.get('/preview/scim/v2/Users').get('Resources', None) if self.groups_to_keep else [] + ws_users = self.scim_client.get_active_users() if self.groups_to_keep else [] + for x in notebooks: # notebook objects has path and object_id nb_path = x.get('path') diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py index ac75b63f..0b0ac89c 100644 --- a/dbclient/ClustersClient.py +++ b/dbclient/ClustersClient.py @@ -6,6 +6,7 @@ import logging_utils import wmconstants from dbclient import * +from .ScimClient import ScimClient class ClustersClient(dbclient): @@ -13,6 +14,7 @@ def __init__(self, configs, checkpoint_service): super().__init__(configs) self._checkpoint_service = checkpoint_service self.groups_to_keep = configs.get("groups_to_keep", False) + self.scim_client = ScimClient(configs, checkpoint_service) 
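# The ScimClient constructed above lets the user lookups further down in this file go through
# the paginated scim_client.get_active_users() helper instead of a single unpaginated
# GET /preview/scim/v2/Users call, mirroring the change made to WorkspaceClient in this commit.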
self.skip_missing_users = configs['skip_missing_users'] self.hipaa = configs.get('hipaa', False) self.bypass_secret_acl = configs.get('bypass_secret_acl', False) @@ -617,7 +619,8 @@ def log_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_cluster # get users list based on groups_to_keep users_list = [] if self.groups_to_keep is not False: - all_users = self.get('/preview/scim/v2/Users').get('Resources', None) + # all_users = self.get('/preview/scim/v2/Users').get('Resources', None) + all_users = self.scim_client.get_active_users() users_list = list(set([user.get("emails")[0].get("value") for user in all_users for group in user.get("groups") if group.get("display") in self.groups_to_keep])) @@ -693,7 +696,8 @@ def log_cluster_policies(self, log_file='cluster_policies.log', acl_log_file='ac # get users list based on groups_to_keep users_list = [] if self.groups_to_keep is not False: - all_users = self.get('/preview/scim/v2/Users').get('Resources', None) + # all_users = self.get('/preview/scim/v2/Users').get('Resources', None) + all_users = self.scim_client.get_active_users() users_list = list(set([user.get("emails")[0].get("value") for user in all_users for group in user.get("groups") if group.get("display") in self.groups_to_keep])) diff --git a/dbclient/ScimClient.py b/dbclient/ScimClient.py index 96f856e0..7545246f 100644 --- a/dbclient/ScimClient.py +++ b/dbclient/ScimClient.py @@ -7,40 +7,70 @@ import concurrent from concurrent.futures import ThreadPoolExecutor from threading_utils import propagate_exceptions +import concurrent.futures class ScimClient(dbclient): def __init__(self, configs, checkpoint_service): super().__init__(configs) self._checkpoint_service = checkpoint_service self.groups_to_keep = configs.get("groups_to_keep", False) + self.users_list = self.get_users_full_from_log() + + + def fetch_page(self, start, count): + endpoint = f'/preview/scim/v2/Users?startIndex={start}&count={count}' + response = self.get(endpoint) + return response.get('Resources', []) def get_active_users(self): - users = self.get('/preview/scim/v2/Users').get('Resources', None) - return users if users else None + + if self._use_logs and self.users_list is None: + results = self.get_users_full_from_log() + elif self._use_logs: + results = self.users_list + + if results is None: + page_size = 10 + first_response = self.get(f'/preview/scim/v2/Users?startIndex=1&count=1') + total = first_response.get('totalResults', 0) + if total == 0: + return None + + indices = range(1, total + 1, page_size) + + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + futures = [executor.submit(self.fetch_page, i, page_size) for i in indices] + results = [] + for future in concurrent.futures.as_completed(futures): + results.extend(future.result()) + + return results or None def log_all_users(self, log_file='users.log'): user_log = self.get_export_dir() + log_file - users = self.get('/preview/scim/v2/Users').get('Resources', None) - if users: + + all_users = self.get_active_users() + + if all_users: with open(user_log, "w", encoding="utf-8") as fp: - for x in users: + for x in all_users: fullname = x.get('name', None) - # if a group list has been passed, check to see if current user is part of groups if self.groups_to_keep: - user_groups = [g['display'] for g in x.get('groups')] + user_groups = [g['display'] for g in x.get('groups', [])] if not set(user_groups).intersection(set(self.groups_to_keep)): continue if fullname: given_name = fullname.get('givenName', None) - # if user is an 
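# For example, with totalResults == 25 and page_size == 10, indices is (1, 11, 21), so three
# pages of up to 10 users each are fetched concurrently; as_completed yields pages in
# completion order, so the combined list is not guaranteed to be ordered by startIndex.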
admin, skip this user entry if x['userName'] == 'admin' and given_name == 'Administrator': continue + fp.write(json.dumps(x) + '\n') else: logging.info("Users returned an empty object") + def log_single_user(self, user_email, log_file='single_user.log'): single_user_log = self.get_export_dir() + log_file users = self.get_active_users() @@ -69,12 +99,38 @@ def get_users_from_log(self, users_log='users.log'): :return: a list of usernames that help identify their workspace paths """ user_logfile = self.get_export_dir() + users_log + username_list = [] - with open(user_logfile, 'r', encoding="utf-8") as fp: - for u in fp: + if self.users_list is None: + with open(user_logfile, 'r', encoding="utf-8") as fp: + for u in fp: + user_json = json.loads(u) + username_list.append(user_json.get('userName')) + else: + for u in self.users_list: user_json = json.loads(u) - username_list.append(user_json.get('userName')) + username_list.append(user_json.get('userName')) + return username_list + + + def get_users_full_from_log(self, users_log='users.log'): + """ + fetch a list of user names from the users log file + meant to be used during group exports where the user list is a subset of users + :param users_log: + :return: a list of usernames that help identify their workspace paths + """ + user_logfile = self.get_export_dir() + users_log + if os.path.isfile(user_logfile): + username_list = [] + with open(user_logfile, 'r', encoding="utf-8") as fp: + for u in fp: + user_json = json.loads(u) + username_list.append(user_json) + return username_list + else: + return None @staticmethod def is_member_a_user(member_json): @@ -98,10 +154,29 @@ def add_username_to_group(self, group_json): # add the userName field to json since ids across environments may not match members = group_json.get('members', []) new_members = [] + users_list = None + # try: + users_list = self.get_users_full_from_log() if self.users_list is None else self.users_list + # except Exception as e: + # logging.info(e) + for m in members: m_id = m['value'] if self.is_member_a_user(m): - user_resp = self.get('/preview/scim/v2/Users/{0}'.format(m_id)) + user_resp = None + if users_list: + # print(users_list[0]['id'].__class__) + # user_resp = next((item for item in users_list if item['id'] == m_id), None) + for u in users_list: + if str(u['id']) == str(m_id): + user_resp = u + break + # user_resp = next(filter(lambda x: x.get("id") == m_id, users_list), None) + + if user_resp is None: + user_resp = self.get('/preview/scim/v2/Users/{0}'.format(m_id)) + else: + user_resp = self.get('/preview/scim/v2/Users/{0}'.format(m_id)) m['userName'] = user_resp['userName'] m['type'] = 'user' elif self.is_member_a_group(m): @@ -113,12 +188,36 @@ def add_username_to_group(self, group_json): new_members.append(m) group_json['members'] = new_members return group_json + + def fetch_group_page(self, start, count): + endpoint = f'/preview/scim/v2/Groups?startIndex={start}&count={count}' + response = self.get(endpoint) + return response.get('Resources', []) + + def get_active_groups(self): + page_size = 10 + first_response = self.get(f'/preview/scim/v2/Groups?startIndex=1&count=1') + total = first_response.get('totalResults', 0) + + if total == 0: + return None + + indices = range(1, total + 1, page_size) + + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + futures = [executor.submit(self.fetch_group_page, i, page_size) for i in indices] + results = [] + for future in concurrent.futures.as_completed(futures): + results.extend(future.result()) + 
return results or None + def log_all_groups(self, group_log_dir='groups/'): group_dir = self.get_export_dir() + group_log_dir os.makedirs(group_dir, exist_ok=True) - group_list = self.get("/preview/scim/v2/Groups").get('Resources', []) + group_list = self.get_active_groups() for x in group_list: + logging.info(f"group: {x}") group_name = x['displayName'] # if groups_to_keep is defined, check to see if current group is a member @@ -126,7 +225,7 @@ def log_all_groups(self, group_log_dir='groups/'): if group_name not in self.groups_to_keep: continue - with open(group_dir + group_name, "w", encoding="utf-8") as fp: + with open(group_dir + group_name.replace("/", "_"), "w", encoding="utf-8") as fp: fp.write(json.dumps(self.add_username_to_group(x))) @staticmethod @@ -146,7 +245,8 @@ def log_groups_from_list(self, group_name_list, group_log_dir='groups/', users_l """ group_dir = self.get_export_dir() + group_log_dir os.makedirs(group_dir, exist_ok=True) - group_list = self.get("/preview/scim/v2/Groups").get('Resources', []) + # group_list = self.get("/preview/scim/v2/Groups").get('Resources', []) + group_list = self.get_active_groups() group_dict = self.build_group_dict(group_list) member_id_list = [] for group_name in group_name_list: @@ -158,7 +258,7 @@ def log_groups_from_list(self, group_name_list, group_log_dir='groups/', users_l sub_group_names = list(map(lambda z: z.get('display'), filtered_sub_groups)) group_name_list.extend(sub_group_names) member_id_list.extend(list(map(lambda y: y['value'], filtered_users))) - with open(group_dir + group_name, "w", encoding="utf-8") as fp: + with open(group_dir + group_name.replace("/", "_"), "w", encoding="utf-8") as fp: group_details.pop('roles', None) # removing the roles field from the groups arg fp.write(json.dumps(self.add_username_to_group(group_details))) users_log = self.get_export_dir() + users_logfile @@ -176,7 +276,8 @@ def log_groups_from_list(self, group_name_list, group_log_dir='groups/', users_l def get_user_id_mapping(self): # return a dict of the userName to id mapping of the new env - user_list = self.get('/preview/scim/v2/Users').get('Resources', None) + # user_list = self.get('/preview/scim/v2/Users').get('Resources', None) + user_list = self.get_active_users() if user_list: user_id_dict = {} for user in user_list: @@ -222,7 +323,7 @@ def assign_group_entitlements(self, group_dir, error_logger): return groups = self.listdir(group_dir) for group_name in groups: - with open(group_dir + group_name, 'r', encoding="utf-8") as fp: + with open(group_dir + group_name.replace("/", "_"), 'r', encoding="utf-8") as fp: group_data = json.loads(fp.read()) entitlements = group_data.get('entitlements', None) if entitlements: @@ -239,7 +340,7 @@ def assign_group_roles(self, group_dir, error_logger): return groups = self.listdir(group_dir) for group_name in groups: - with open(group_dir + group_name, 'r', encoding="utf-8") as fp: + with open(group_dir + group_name.replace("/", "_"), 'r', encoding="utf-8") as fp: group_data = json.loads(fp.read()) roles = group_data.get('roles', None) if roles: @@ -418,7 +519,7 @@ def import_groups(self, group_dir, current_user_ids, error_logger): # dict of { old_user_id : email } old_user_emails = self.get_old_user_emails() for group_name in groups: - with open(group_dir + group_name, 'r', encoding="utf-8") as fp: + with open(group_dir + group_name.replace("/", "_"), 'r', encoding="utf-8") as fp: members = json.loads(fp.read()).get('members', None) logging.info(f"Importing group {group_name} :") if members: diff 
--git a/dbclient/WorkspaceClient.py b/dbclient/WorkspaceClient.py index e39fd405..04dde05b 100644 --- a/dbclient/WorkspaceClient.py +++ b/dbclient/WorkspaceClient.py @@ -463,7 +463,8 @@ def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer notebooks = self.filter_workspace_items(items, 'NOTEBOOK') libraries = self.filter_workspace_items(items, 'LIBRARY') # only get user list if we are filtering by group - ws_users = self.get('/preview/scim/v2/Users').get('Resources', None) if self.groups_to_keep else [] + # ws_users = self.get('/preview/scim/v2/Users').get('Resources', None) if self.groups_to_keep else [] + ws_users = ws_users = ScimClient.get_active_users() if self.groups_to_keep else [] for x in notebooks: # notebook objects has path and object_id nb_path = x.get('path') diff --git a/dbclient/dbclient.py b/dbclient/dbclient.py index 54a10ea5..dfca4189 100644 --- a/dbclient/dbclient.py +++ b/dbclient/dbclient.py @@ -56,6 +56,7 @@ def __init__(self, configs): self._url = url_validation(configs['url']) self._update_token(configs['token']) self._export_dir = configs['export_dir'] + self._use_logs = configs['use_logs'] self._is_aws = configs['is_aws'] self._is_azure = configs['is_azure'] self._is_gcp = configs['is_gcp'] diff --git a/dbclient/parser.py b/dbclient/parser.py index 8088da90..03dc8a7c 100644 --- a/dbclient/parser.py +++ b/dbclient/parser.py @@ -141,7 +141,8 @@ def get_export_parser(): parser.add_argument('--database', action='store', help='Database name to export for the metastore and table ACLs. Single database name supported') - # iam role used to export the metastore + # iam role used to export the + parser.add_argument('--iam', action='store', help='IAM Instance Profile to export metastore entires') @@ -191,6 +192,9 @@ def get_export_parser(): parser.add_argument('--set-export-dir', action='store', help='Set the base directory to export artifacts') + + parser.add_argument('--use-logs', action='store_true', + help='Set flag to use export logs if they exists') parser.add_argument('--pause-all-jobs', action='store_true', help='Pause all scheduled jobs') @@ -453,6 +457,7 @@ def build_client_config(profile, url, token, args): config['export_dir'] = 'gcp_logs/' config['use_checkpoint'] = args.use_checkpoint + config['use_logs'] = args.use_logs config['num_parallel'] = args.num_parallel config['retry_total'] = args.retry_total config['retry_backoff'] = args.retry_backoff @@ -489,6 +494,9 @@ def get_pipeline_parser() -> argparse.ArgumentParser: parser.add_argument('--set-export-dir', action='store', help='Set the base directory to export artifacts') + + parser.add_argument('--use-logs', action='store_true', + help='Set flag to use export logs if they exists') parser.add_argument('--cluster-name', action='store', required=False, help='Cluster name to export the metastore to a specific cluster. 
Cluster will be started.') @@ -506,6 +514,7 @@ def get_pipeline_parser() -> argparse.ArgumentParser: # Cluster + Job arguments parser.add_argument('--nitro', action='store_true', help='Set to use Nitro cluster types for all clusters and jobs.') + # Jobs arguments parser.add_argument('--default-job-owner', action='store', default=False, From f8be476da802c403debfb61586684ee1556d42b1 Mon Sep 17 00:00:00 2001 From: James Parham Date: Thu, 26 Jun 2025 12:23:57 -0400 Subject: [PATCH 110/111] added scim_client to WorkspaceClient for retrieving users --- dbclient/WorkspaceClient.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbclient/WorkspaceClient.py b/dbclient/WorkspaceClient.py index 04dde05b..e7927684 100644 --- a/dbclient/WorkspaceClient.py +++ b/dbclient/WorkspaceClient.py @@ -29,6 +29,7 @@ def __init__(self, configs, checkpoint_service): self._checkpoint_service = checkpoint_service self.groups_to_keep = configs.get("groups_to_keep", False) self.skip_missing_users = configs['skip_missing_users'] + self.scim_client = ScimClient(configs, checkpoint_service) _languages = {'.py': 'PYTHON', '.scala': 'SCALA', @@ -464,7 +465,7 @@ def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer libraries = self.filter_workspace_items(items, 'LIBRARY') # only get user list if we are filtering by group # ws_users = self.get('/preview/scim/v2/Users').get('Resources', None) if self.groups_to_keep else [] - ws_users = ws_users = ScimClient.get_active_users() if self.groups_to_keep else [] + ws_users = self.scim_client.get_active_users() if self.groups_to_keep else [] for x in notebooks: # notebook objects has path and object_id nb_path = x.get('path') From 9167972b50f8d46114a13558d72db2eed4a9553c Mon Sep 17 00:00:00 2001 From: James Parham Date: Wed, 9 Jul 2025 11:52:37 -0400 Subject: [PATCH 111/111] added results=None param to get active users in ScimClient --- .gitignore | 2 ++ dbclient/ScimClient.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 72a73cf4..a1a643ca 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ dist/ databricks_migration_tool.egg-info migrate.iml export_dir/ +unversioned/ + diff --git a/dbclient/ScimClient.py b/dbclient/ScimClient.py index 7545246f..6277020b 100644 --- a/dbclient/ScimClient.py +++ b/dbclient/ScimClient.py @@ -22,7 +22,7 @@ def fetch_page(self, start, count): response = self.get(endpoint) return response.get('Resources', []) - def get_active_users(self): + def get_active_users(self, results=None): if self._use_logs and self.users_list is None: results = self.get_users_full_from_log()
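
Note on the pagination introduced in PATCH 109: ScimClient.get_active_users and get_active_groups first probe the SCIM endpoint with count=1 to read totalResults, then fan page requests out across a thread pool. The snippet below is a simplified standalone sketch of that pattern, not the exact class method: `get` is a stand-in for the dbclient REST helper (for example a thin wrapper around an authenticated request to the workspace's /api/2.0 SCIM endpoint), and the page size and worker count simply mirror the hard-coded values (10 and 10) used in the patch.

import concurrent.futures

PAGE_SIZE = 10     # page size used in the patch for both Users and Groups
MAX_WORKERS = 10   # thread pool size used in the patch

def fetch_page(get, start, count):
    # One worker request: pull a single page of SCIM resources.
    endpoint = f'/preview/scim/v2/Users?startIndex={start}&count={count}'
    return get(endpoint).get('Resources', [])

def get_all_users(get):
    # Probe with a one-item request to learn the total number of users.
    total = get('/preview/scim/v2/Users?startIndex=1&count=1').get('totalResults', 0)
    if total == 0:
        return None
    # SCIM startIndex is 1-based, so pages start at 1, 11, 21, ...
    starts = range(1, total + 1, PAGE_SIZE)
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(fetch_page, get, s, PAGE_SIZE) for s in starts]
        for future in concurrent.futures.as_completed(futures):
            results.extend(future.result())
    return results or None

Because as_completed yields pages in completion order, the combined list is unordered; the callers shown in the patch only filter users by group membership or build id-to-userName mappings, so ordering does not matter. When the new --use-logs flag is set and users.log already exists in the export directory, get_active_users instead rehydrates the full user objects from that file (one JSON object per line) via get_users_full_from_log, avoiding the API calls entirely.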