diff --git a/.gitignore b/.gitignore
index d206ab0..a1a643c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,6 @@ dist/
.tox/
databricks_migration_tool.egg-info
migrate.iml
+export_dir/
+unversioned/
+
diff --git a/Root Hive Migration.dbc b/Root Hive Migration.dbc
new file mode 100644
index 0000000..e9323d6
Binary files /dev/null and b/Root Hive Migration.dbc differ
diff --git a/Workspace Sizing Notebook.html b/Workspace Sizing Notebook.html
new file mode 100644
index 0000000..889a597
--- /dev/null
+++ b/Workspace Sizing Notebook.html
@@ -0,0 +1,43 @@
+Workspace Sizing Notebook - Databricks
diff --git a/WorkspaceClient_modified.py b/WorkspaceClient_modified.py
new file mode 100644
index 0000000..4666422
--- /dev/null
+++ b/WorkspaceClient_modified.py
@@ -0,0 +1,975 @@
+import base64
+import hashlib
+import json
+import re
+
+from dbclient import *
+import wmconstants
+import concurrent
+from concurrent.futures import ThreadPoolExecutor
+from thread_safe_writer import ThreadSafeWriter
+from threading_utils import propagate_exceptions
+from timeit import default_timer as timer
+from datetime import timedelta
+import logging_utils
+import logging
+import os
+from dbclient.common.WorkspaceDiff import *
+from dbclient.ScimClient import ScimClient
+
+WS_LIST = "/workspace/list"
+WS_STATUS = "/workspace/get-status"
+WS_MKDIRS = "/workspace/mkdirs"
+WS_IMPORT = "/workspace/import"
+WS_EXPORT = "/workspace/export"
+LS_ZONES = "/clusters/list-zones"
+REPOS = "/repos"
+
+class WorkspaceClient(dbclient):
+ def __init__(self, configs, checkpoint_service):
+ super().__init__(configs)
+ self.scim_client = ScimClient(configs, checkpoint_service)
+ self._checkpoint_service = checkpoint_service
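+        # "groups_to_keep" is an optional list of group names used to filter exported items; False (the default) disables group filtering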
+ self.groups_to_keep = configs.get("groups_to_keep", False)
+ self.skip_missing_users = configs['skip_missing_users']
+
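+    # maps exported source-file extensions to the language value expected by the workspace import API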
+ _languages = {'.py': 'PYTHON',
+ '.scala': 'SCALA',
+ '.r': 'R',
+ '.sql': 'SQL'}
+
+ def get_language(self, file_ext):
+ return self._languages[file_ext]
+
+ def get_top_level_folders(self):
+ # get top level folders excluding the /Users path
+ supported_types = ('NOTEBOOK', 'DIRECTORY')
+ root_items = self.get(WS_LIST, {'path': '/'}).get('objects', [])
+ # filter out Projects and Users folders
+ non_users_dir = list(filter(lambda x: (x.get('path') not in ['/Users', '/Repos']
+ and x.get('path') != '/Projects'), root_items))
+ dirs_and_nbs = list(filter(lambda x: (x.get('object_type') in supported_types),
+ non_users_dir))
+ return dirs_and_nbs
+
+ def export_top_level_folders(self):
+ ls_tld = self.get_top_level_folders()
+ logged_nb_count = 0
+ workspace_log_writer = ThreadSafeWriter(self.get_export_dir() + 'user_workspace.log', "a")
+ libs_log_writer = ThreadSafeWriter(self.get_export_dir() + 'libraries.log', "a")
+ dir_log_writer = ThreadSafeWriter(self.get_export_dir() + 'user_dirs.log', "a")
+ checkpoint_item_log_set = self._checkpoint_service.get_checkpoint_key_set(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT
+ )
+ try:
+ for tld_obj in ls_tld:
+ # obj has 3 keys, object_type, path, object_id
+ tld_path = tld_obj.get('path')
+ log_count = self.log_all_workspace_items(
+ tld_path, workspace_log_writer, libs_log_writer, dir_log_writer, checkpoint_item_log_set)
+ logged_nb_count += log_count
+ finally:
+ workspace_log_writer.close()
+ libs_log_writer.close()
+ dir_log_writer.close()
+ dl_nb_count = self.download_notebooks()
+ print(f'Total logged notebooks: {logged_nb_count}')
+ print(f'Total Downloaded notebooks: {dl_nb_count}')
+
+ def get_user_import_args(self, full_local_path, nb_full_path):
+ """
+ helper function to define the import parameters to upload a notebook object
+ :param full_local_path: full local path of the notebook to read
+ :param nb_full_path: full destination path, e.g. /Users/foo@db.com/bar.dbc . Includes extension / type
+ :return: return the full input args to upload to the destination system
+ """
+        (nb_path_dest, nb_type) = os.path.splitext(nb_full_path)
+        with open(full_local_path, "rb") as fp:
+            content = base64.encodebytes(fp.read()).decode('utf-8')
+        in_args = {
+            "content": content,
+            "path": nb_path_dest,
+            "format": self.get_file_format()
+        }
+ if self.is_source_file_format():
+ if self.is_overwrite_notebooks():
+ in_args['overwrite'] = True
+ if nb_type == '.dbc':
+ raise ValueError('Export is in DBC default format. Must export as SOURCE')
+ in_args['language'] = self.get_language(nb_type)
+ in_args['object_type'] = 'NOTEBOOK'
+ return in_args
+
+ @staticmethod
+ def build_ws_lookup_table(success_ws_logfile):
+ ws_hashmap = set()
+ with open(success_ws_logfile, 'r', encoding='utf-8') as fp:
+ for line in fp:
+ ws_hashmap.add(line.rstrip())
+ return ws_hashmap
+
+ @staticmethod
+ def is_user_ws_item(ws_dir):
+ """
+ Checks if this is a user artifact / notebook.
+ We can't create user home folders, hence we need to identify user items
+ """
+ path_list = [x for x in ws_dir.split('/') if x]
+ if len(path_list) >= 2 and path_list[0] == 'Users':
+ return True
+ return False
+
+ @staticmethod
+ def is_repo(ws_dir):
+ """
+ Checks if this item is part of a repo.
+ We need to use a separate API for these, so they should not be treated as standard WS items
+ """
+ path_list = [x for x in ws_dir.split('/') if x]
+ if len(path_list) >= 2 and path_list[0] == 'Repos':
+ return True
+ return False
+
+ @staticmethod
+ def is_user_ws_root(ws_dir):
+ """
+        Check if we're at the user's home folder or the Repos root folder, to skip folder creation
+ """
+ if ws_dir in ['/Users/', '/Users', '/Repos/', '/Repos']:
+ return True
+ path_list = [x for x in ws_dir.split('/') if x]
+ if len(path_list) == 2 and path_list[0] == 'Users':
+ return True
+ return False
+
+ @staticmethod
+ def get_user(ws_dir):
+ """
+ returns the username of the workspace / folder path
+ """
+ path_list = [x for x in ws_dir.split('/') if x]
+ if len(path_list) < 2:
+ raise ValueError("Error: Not a users workspace directory")
+ return path_list[1]
+
+ @staticmethod
+ def is_user_trash(ws_path):
+ """
+        checks if this is the user's home folder Trash directory, which is a special dir
+ """
+ path_list = ws_path.split('/')
+ if len(path_list) == 4:
+ if path_list[1] == 'Users' and path_list[3] == 'Trash':
+ return True
+ return False
+
+ def is_user_home_empty(self, username):
+ user_root = '/Users/' + username.rstrip().lstrip()
+ get_args = {'path': user_root}
+ items = self.get(WS_LIST, get_args).get('objects', None)
+ if items:
+ folders = self.filter_workspace_items(items, 'DIRECTORY')
+ notebooks = self.filter_workspace_items(items, 'NOTEBOOK')
+ # if both notebooks and directories are empty, return true
+ if not folders and not notebooks:
+ return True
+ return False
+ return True
+
+ def get_num_of_saved_users(self, export_dir):
+ """
+        returns the number of exported user items, to check against the number of created users in the new workspace;
+        this helps identify whether the new workspace is ready for the import, or whether we should skip / archive failed imports
+ """
+ # get current number of saved workspaces
+ user_home_dir = export_dir + 'Users'
+ num_of_users = 0
+ if os.path.exists(user_home_dir):
+ ls = self.listdir(user_home_dir)
+ for x in ls:
+ if os.path.isdir(user_home_dir + '/' + x):
+ num_of_users += 1
+ return num_of_users
+
+ def export_user_home(self, username, local_export_dir, num_parallel=4):
+ """
+ Export the provided user's home directory
+        :param username: user whose home directory will be exported
+ :param local_export_dir: folder location to do single user exports
+ :return: None
+ """
+ original_export_dir = self.get_export_dir()
+ user_export_dir = self.get_export_dir() + local_export_dir
+ user_root = '/Users/' + username.rstrip().lstrip()
+ self.set_export_dir(user_export_dir + '/{0}/'.format(username))
+ print("Export path: {0}".format(self.get_export_dir()))
+ os.makedirs(self.get_export_dir(), exist_ok=True)
+ workspace_log_writer = ThreadSafeWriter(self.get_export_dir() + 'user_workspace.log', "a")
+ libs_log_writer = ThreadSafeWriter(self.get_export_dir() + 'libraries.log', "a")
+ dir_log_writer = ThreadSafeWriter(self.get_export_dir() + 'user_dirs.log', "a")
+ checkpoint_item_log_set = self._checkpoint_service.get_checkpoint_key_set(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT
+ )
+ try:
+ num_of_nbs = self.log_all_workspace_items(
+ user_root, workspace_log_writer, libs_log_writer, dir_log_writer, checkpoint_item_log_set)
+ finally:
+ workspace_log_writer.close()
+ libs_log_writer.close()
+ dir_log_writer.close()
+
+ if num_of_nbs == 0:
+ raise ValueError('User does not have any notebooks in this path. Please verify the case of the email')
+ num_of_nbs_dl = self.download_notebooks(ws_dir='user_artifacts/')
+ print(f"Total notebooks logged: {num_of_nbs}")
+ print(f"Total notebooks downloaded: {num_of_nbs_dl}")
+ if num_of_nbs != num_of_nbs_dl:
+ print(f"Notebooks logged != downloaded. Check the failed download file at: {user_export_dir}")
+ print(f"Exporting the notebook permissions for {username}")
+ acl_notebooks_writer = ThreadSafeWriter("acl_notebooks.log", "w")
+ acl_notebooks_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir())
+ try:
+ self.log_acl_to_file(
+ 'notebooks', 'user_workspace.log', acl_notebooks_writer, acl_notebooks_error_logger, num_parallel)
+ finally:
+ acl_notebooks_writer.close()
+
+ print(f"Exporting the directories permissions for {username}")
+ acl_directories_writer = ThreadSafeWriter("acl_directories.log", "w")
+ acl_directories_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT, self.get_export_dir())
+ try:
+ self.log_acl_to_file(
+ 'directories', 'user_dirs.log', acl_directories_writer, acl_directories_error_logger, num_parallel)
+ finally:
+ acl_directories_writer.close()
+ # reset the original export dir for other calls to this method using the same client
+ self.set_export_dir(original_export_dir)
+
+ def import_user_home(self, username, local_export_dir):
+ """
+ Import the provided user's home directory
+ logs/user_exports/{{USERNAME}}/ stores the log files to understand what was exported
+ logs/user_exports/{{USERNAME}}/user_artifacts/ stores the notebook contents
+        :param username: user whose home directory will be imported
+        :param local_export_dir: the log directory for this user's workspace items
+ :return: None
+ """
+ original_export_dir = self.get_export_dir()
+ user_import_dir = self.get_export_dir() + local_export_dir
+ if self.does_user_exist(username):
+ print("Yes, we can upload since the user exists")
+ else:
+ print("User must exist before we upload the notebook contents. Please add the user to the platform first")
+ user_root = '/Users/' + username.rstrip().lstrip()
+ self.set_export_dir(user_import_dir + '/{0}/'.format(username))
+ print("Import local path: {0}".format(self.get_export_dir()))
+ notebook_dir = self.get_export_dir() + 'user_artifacts/'
+ for root, subdirs, files in self.walk(notebook_dir):
+ upload_dir = '/' + root.replace(notebook_dir, '')
+            # if the upload dir is one of the two root directories, skip and continue
+ if upload_dir == '/' or upload_dir == '/Users':
+ continue
+ if not self.is_user_ws_root(upload_dir):
+ # if it is not the /Users/example@example.com/ root path, don't create the folder
+ resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
+ print(resp_mkdirs)
+ for f in files:
+ # get full path for the local notebook file
+ local_file_path = os.path.join(root, f)
+ # create upload path and remove file format extension
+ ws_file_path = upload_dir + '/' + f
+ # generate json args with binary data for notebook to upload to the workspace path
+ nb_input_args = self.get_user_import_args(local_file_path, ws_file_path)
+ # call import to the workspace
+ if self.is_verbose():
+ print("Path: {0}".format(nb_input_args['path']))
+ resp_upload = self.post(WS_IMPORT, nb_input_args)
+ if self.is_verbose():
+ print(resp_upload)
+
+ # import the user's workspace ACLs
+ notebook_acl_logs = user_import_dir + f'/{username}/acl_notebooks.log'
+ acl_notebooks_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir())
+        if os.path.exists(notebook_acl_logs):
+            print(f"Importing the notebook acls for {username}")
+            checkpoint_notebook_acl_set = self._checkpoint_service.get_checkpoint_key_set(
+                wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT)
+            with open(notebook_acl_logs, encoding='utf-8') as nb_acls_fp:
+                for nb_acl_str in nb_acls_fp:
+                    self.apply_acl_on_object(nb_acl_str, acl_notebooks_error_logger, checkpoint_notebook_acl_set)
+
+ dir_acl_logs = user_import_dir + f'/{username}/acl_directories.log'
+ acl_dir_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT, self.get_export_dir())
+        if os.path.exists(dir_acl_logs):
+            print(f"Importing the directory acls for {username}")
+            checkpoint_dir_acl_set = self._checkpoint_service.get_checkpoint_key_set(
+                wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT)
+            with open(dir_acl_logs, encoding='utf-8') as dir_acls_fp:
+                for dir_acl_str in dir_acls_fp:
+                    self.apply_acl_on_object(dir_acl_str, acl_dir_error_logger, checkpoint_dir_acl_set)
+ self.set_export_dir(original_export_dir)
+
+ def download_notebooks(self, ws_log_file='user_workspace.log', ws_dir='artifacts/', num_parallel=4):
+ """
+ Loop through all notebook paths in the logfile and download individual notebooks
+ :param ws_log_file: logfile for all notebook paths in the workspace
+ :param ws_dir: export directory to store all notebooks
+        :return: number of notebooks downloaded
+ """
+ checkpoint_notebook_set = self._checkpoint_service.get_checkpoint_key_set(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT)
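+        # this checkpoint set lets a re-run skip notebooks that were already downloaded in a previous session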
+ ws_log = self.get_export_dir() + ws_log_file
+ notebook_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT, self.get_export_dir())
+ num_notebooks = 0
+ if not os.path.exists(ws_log):
+ raise Exception("Run --workspace first to download full log of all notebooks.")
+ with open(ws_log, "r", encoding='utf-8') as fp:
+ # notebook log metadata file now contains object_id to help w/ ACL exports
+ # pull the path from the data to download the individual notebook contents
+ with ThreadPoolExecutor(max_workers=num_parallel) as executor:
+ futures = [executor.submit(self.download_notebook_helper, notebook_data, checkpoint_notebook_set, notebook_error_logger, self.get_export_dir() + ws_dir) for notebook_data in fp]
+ for future in concurrent.futures.as_completed(futures):
+ dl_resp = future.result()
+ if 'error' not in dl_resp:
+ num_notebooks += 1
+ return num_notebooks
+
+ def download_notebook_helper(self, notebook_data, checkpoint_notebook_set, error_logger, export_dir='artifacts/'):
+ """
+        Helper function to download an individual notebook, or log the failure in the failure logfile
+        :param notebook_data: json log line containing the notebook path and object id
+        :param checkpoint_notebook_set: checkpoint set used to skip notebooks downloaded in a previous run
+        :param error_logger: logger used to record failed downloads
+        :param export_dir: directory to store all notebooks
+        :return: the notebook path that was successfully downloaded
+ """
+ notebook_path = json.loads(notebook_data).get('path', None).rstrip('\n')
+ if checkpoint_notebook_set.contains(notebook_path):
+ return {'path': notebook_path}
+ get_args = {'path': notebook_path, 'format': self.get_file_format()}
+ if self.is_verbose():
+ logging.info("Downloading: {0}".format(get_args['path']))
+ resp = self.get(WS_EXPORT, get_args)
+ if resp.get('error', None):
+ resp['path'] = notebook_path
+ logging_utils.log_response_error(error_logger, resp)
+ return resp
+ if resp.get('error_code', None):
+ resp['path'] = notebook_path
+ logging_utils.log_response_error(error_logger, resp)
+ return resp
+ nb_path = os.path.dirname(notebook_path)
+ if nb_path != '/':
+ # path is NOT empty, remove the trailing slash from export_dir
+ save_path = export_dir[:-1] + nb_path + '/'
+ else:
+ save_path = export_dir
+
+        # If the local path doesn't exist, we create it before we save the contents
+ if not os.path.exists(save_path) and save_path:
+ os.makedirs(save_path, exist_ok=True)
+
+ save_filename = save_path + os.path.basename(notebook_path) + '.' + resp.get('file_type')
+ if os.path.isfile(save_filename):
+ logging.warning(f"Notebook file {save_filename} already exists; please rename in source workspace. "
+ f"Note that files are case-insensitive")
+ return {}
+ logging.info(save_filename)
+ with open(save_filename, "wb") as f:
+ f.write(base64.b64decode(resp['content']))
+ checkpoint_notebook_set.write(notebook_path)
+ return {'path': notebook_path}
+
+ def filter_workspace_items(self, item_list, item_type):
+ """
+ Helper function to filter on different workspace types.
+ :param item_list: iterable of workspace items
+ :param item_type: DIRECTORY, NOTEBOOK, LIBRARY
+ :return: list of items filtered by type
+ """
+ supported_types = {'DIRECTORY', 'NOTEBOOK', 'LIBRARY'}
+ if item_type not in supported_types:
+            raise ValueError('Unsupported type provided: {0}.\nSupported types: {1}'.format(item_type,
+                                                                                            str(supported_types)))
+ filtered_list = list(self.my_map(lambda y: {'path': y.get('path', None),
+ 'object_id': y.get('object_id', None)},
+ filter(lambda x: x.get('object_type', None) == item_type, item_list)))
+ return filtered_list
+
+ def init_workspace_logfiles(self, workspace_log_file='user_workspace.log',
+ libs_log_file='libraries.log', workspace_dir_log_file='user_dirs.log'):
+ """
+ initialize the logfile locations since we run a recursive function to download notebooks
+ """
+ workspace_log = self.get_export_dir() + workspace_log_file
+ libs_log = self.get_export_dir() + libs_log_file
+ workspace_dir_log = self.get_export_dir() + workspace_dir_log_file
+ if not self._checkpoint_service.checkpoint_file_exists(wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT):
+ if os.path.exists(workspace_log):
+ os.remove(workspace_log)
+ if os.path.exists(workspace_dir_log):
+ os.remove(workspace_dir_log)
+ if os.path.exists(libs_log):
+ os.remove(libs_log)
+
+ def log_all_workspace_items_entry(self, ws_path='/', workspace_log_file='user_workspace.log', libs_log_file='libraries.log', dir_log_file='user_dirs.log', repos_log_file='repos.log', exclude_prefixes=[]):
+ logging.info(f"Skip all paths with the following prefixes: {exclude_prefixes}")
+
+ workspace_log_writer = ThreadSafeWriter(self.get_export_dir() + workspace_log_file, "a")
+ libs_log_writer = ThreadSafeWriter(self.get_export_dir() + libs_log_file, "a")
+ dir_log_writer = ThreadSafeWriter(self.get_export_dir() + dir_log_file, "a")
+ #repos_log_writer = ThreadSafeWriter(self.get_export_dir() + repos_log_file, "a")
+ checkpoint_item_log_set = self._checkpoint_service.get_checkpoint_key_set(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT
+ )
+ try:
+ num_nbs = self.log_all_workspace_items(ws_path=ws_path,
+ workspace_log_writer=workspace_log_writer,
+ libs_log_writer=libs_log_writer,
+ dir_log_writer=dir_log_writer,
+ repos_log_writer=None,
+ checkpoint_set=checkpoint_item_log_set,
+ exclude_prefixes=exclude_prefixes)
+ finally:
+ workspace_log_writer.close()
+ libs_log_writer.close()
+ dir_log_writer.close()
+ #repos_log_writer.close()
+
+ return num_nbs
+
+ def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer, dir_log_writer, repos_log_writer, checkpoint_set, exclude_prefixes=[]):
+ """
+        Loop and log all workspace items to download them at a later time
+        :param ws_path: root path to log all the items of the notebook workspace
+        :param workspace_log_writer: writer that records notebook paths
+        :param libs_log_writer: writer that records workspace libraries
+        :param dir_log_writer: writer that records user directories
+        :param repos_log_writer: writer that records repos (pass None to skip repo logging)
+        :param checkpoint_set: checkpoint set used to resume interrupted exports
+        :param exclude_prefixes: list of path prefixes to skip
+        :return: number of notebooks logged
+ """
+ # define log file names for notebooks, folders, and libraries
+ if ws_path == '/':
+ # default is the root path
+ get_args = {'path': '/'}
+ else:
+ get_args = {'path': ws_path}
+
+ if not os.path.exists(self.get_export_dir()):
+ os.makedirs(self.get_export_dir(), exist_ok=True)
+ items = self.get(WS_LIST, get_args).get('objects', None)
+ #repos = self.get(REPOS).get('repos', None)
+ num_nbs = 0
+ if self.is_verbose():
+ logging.info("Listing: {0}".format(get_args['path']))
+ if items:
+ # list all the users folders only
+ folders = self.filter_workspace_items(items, 'DIRECTORY')
+            # should be no notebooks, but let's filter and check later
+ notebooks = self.filter_workspace_items(items, 'NOTEBOOK')
+ libraries = self.filter_workspace_items(items, 'LIBRARY')
+ # only get user list if we are filtering by group
+ # ws_users = self.get('/preview/scim/v2/Users').get('Resources', None) if self.groups_to_keep else []
+ ws_users = self.scim_client.get_active_users() if self.groups_to_keep else []
+
+ for x in notebooks:
+ # notebook objects has path and object_id
+ nb_path = x.get('path')
+
+ # if the current user is not in kept groups, skip this nb
+ if self.groups_to_keep and self.is_user_ws_item(nb_path):
+ nb_user = self.get_user(nb_path)
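+                    # collect the display names of every group the notebook owner belongs to, matching on the owner's primary email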
+ user_groups = [group.get("display") for user in ws_users if user.get("emails")[0].get("value") == nb_user for group in user.get("groups")]
+ if not set(user_groups).intersection(set(self.groups_to_keep)):
+ if self.is_verbose():
+ logging.info("Skipped notebook path due to group exclusion: {0}".format(x.get('path')))
+ continue
+
+ if not checkpoint_set.contains(nb_path) and not nb_path.startswith(tuple(exclude_prefixes)):
+ if self.is_verbose():
+ logging.info("Saving path: {0}".format(x.get('path')))
+ workspace_log_writer.write(json.dumps(x) + '\n')
+ checkpoint_set.write(nb_path)
+ num_nbs += 1
+ for y in libraries:
+ lib_path = y.get('path')
+
+ # if the current user is not in kept groups, skip this lib
+ if self.groups_to_keep and self.is_user_ws_item(lib_path):
+ nb_user = self.get_user(lib_path)
+ user_groups = [group.get("display") for user in ws_users if user.get("emails")[0].get("value") == nb_user for group in user.get("groups")]
+ if not set(user_groups).intersection(set(self.groups_to_keep)):
+ if self.is_verbose():
+ logging.info("Skipped library path due to group exclusion: {0}".format(lib_path))
+ continue
+
+ if not checkpoint_set.contains(lib_path) and not lib_path.startswith(tuple(exclude_prefixes)):
+ libs_log_writer.write(json.dumps(y) + '\n')
+ checkpoint_set.write(lib_path)
+ # log all directories to export permissions
+ if folders:
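+            # recursive helper: skip Trash and Repos folders; otherwise log the folder and recurse into it, returning the notebook count found beneath it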
+ def _recurse_log_all_workspace_items(folder):
+ dir_path = folder.get('path', None)
+ if not self.is_user_trash(dir_path) and not self.is_repo(dir_path):
+ dir_log_writer.write(json.dumps(folder) + '\n')
+ return self.log_all_workspace_items(ws_path=dir_path,
+ workspace_log_writer=workspace_log_writer,
+ libs_log_writer=libs_log_writer,
+ dir_log_writer=dir_log_writer,
+ repos_log_writer=None,
+ checkpoint_set=checkpoint_set,
+ exclude_prefixes=exclude_prefixes)
+
+ for folder in folders:
+ dir_path = folder.get('path', None)
+
+ # if the current user is not in kept groups, skip this dir
+ if self.groups_to_keep and self.is_user_ws_item(dir_path):
+ dir_user = self.get_user(dir_path)
+ user_groups = [group.get("display") for user in ws_users if
+ user.get("emails")[0].get("value") == dir_user for group in user.get("groups")]
+ if not set(user_groups).intersection(set(self.groups_to_keep)):
+ if self.is_verbose():
+ logging.info("Skipped directory due to group exclusion: {0}".format(dir_path))
+ continue
+
+ if not checkpoint_set.contains(dir_path) and not dir_path.startswith(tuple(exclude_prefixes)):
+ num_nbs_plus = _recurse_log_all_workspace_items(folder)
+ checkpoint_set.write(dir_path)
+ if num_nbs_plus:
+ num_nbs += num_nbs_plus
+ # log all repos
+
+ # if repos_log_writer and repos:
+ # for repo in repos:
+ # repo_path = repo.get('path', "")
+ # if not checkpoint_set.contains(repo_path) and not repo_path.startswith(tuple(exclude_prefixes)):
+ # repos_log_writer.write(json.dumps(repo) + '\n')
+ # checkpoint_set.write(repo_path)
+
+ return num_nbs
+
+ def get_obj_id_by_path(self, input_path):
+ resp = self.get(WS_STATUS, {'path': input_path})
+ obj_id = resp.get('object_id', None)
+ return obj_id
+
+ def log_acl_to_file(self, artifact_type, read_log_filename, writer, error_logger, num_parallel):
+ """
+ generic function to log the notebook/directory ACLs to specific file names
+        :param artifact_type: one of 'notebooks', 'directories', 'repos'; the ACL type to be logged
+        :param read_log_filename: the list of the notebook paths / object ids
+        :param writer: ThreadSafeWriter that stores the object_id ACLs
+        :param error_logger: logger to log errors
+        :param num_parallel: number of parallel threads used to fetch ACLs
+ """
+ read_log_path = self.get_export_dir() + read_log_filename
+ if not os.path.exists(read_log_path):
+ logging.info(f"No log exists for {read_log_path}. Skipping ACL export ...")
+ return
+
+ def _acl_log_helper(json_data):
+ data = json.loads(json_data)
+ obj_id = data.get('object_id', None)
+ alt_id = data.get('id', None)
+
+ if alt_id and not obj_id:
+ obj_id = alt_id
+
+ api_endpoint = '/permissions/{0}/{1}'.format(artifact_type, obj_id)
+ acl_resp = self.get(api_endpoint)
+ acl_resp['path'] = data.get('path')
+ if logging_utils.log_response_error(error_logger, acl_resp):
+ return
+ acl_resp.pop('http_status_code')
+ writer.write(json.dumps(acl_resp) + '\n')
+
+ with open(read_log_path, 'r', encoding='utf-8') as read_fp:
+ with ThreadPoolExecutor(max_workers=num_parallel) as executor:
+ futures = [executor.submit(_acl_log_helper, json_data) for json_data in read_fp]
+ concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
+ propagate_exceptions(futures)
+
+ def log_all_workspace_acls(self, workspace_log_file='user_workspace.log',
+ dir_log_file='user_dirs.log',
+ repo_log_file="repos.log",
+ num_parallel=4):
+ """
+ loop through all notebooks and directories to store their associated ACLs
+ :param workspace_log_file: input file for user notebook listing
+ :param dir_log_file: input file for user directory listing
+ :param repo_log_file: input file for repo listing
+ """
+ # define log file names for notebooks, folders, and libraries
+ logging.info("Exporting the notebook permissions")
+ start = timer()
+ acl_notebooks_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir())
+ acl_notebooks_writer = ThreadSafeWriter(self.get_export_dir() + "acl_notebooks.log", "w")
+ try:
+ self.log_acl_to_file('notebooks', workspace_log_file, acl_notebooks_writer, acl_notebooks_error_logger, num_parallel)
+ finally:
+ acl_notebooks_writer.close()
+ end = timer()
+ logging.info("Complete Notebook ACLs Export Time: " + str(timedelta(seconds=end - start)))
+
+ logging.info("Exporting the directories permissions")
+ start = timer()
+ acl_directory_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT, self.get_export_dir())
+ acl_directory_writer = ThreadSafeWriter(self.get_export_dir() + "acl_directories.log", "w")
+ try:
+ self.log_acl_to_file('directories', dir_log_file, acl_directory_writer, acl_directory_error_logger, num_parallel)
+ finally:
+ acl_directory_writer.close()
+ end = timer()
+ logging.info("Complete Directories ACLs Export Time: " + str(timedelta(seconds=end - start)))
+
+ logging.info("Exporting the repo permissions")
+ start = timer()
+ acl_repo_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_EXPORT, wmconstants.WORKSPACE_REPO_ACL_OBJECT, self.get_export_dir())
+ acl_repo_writer = ThreadSafeWriter(self.get_export_dir() + "acl_repos.log", "w")
+ try:
+ self.log_acl_to_file('repos', repo_log_file, acl_repo_writer, acl_repo_error_logger,
+ num_parallel)
+ finally:
+ acl_repo_writer.close()
+ end = timer()
+ logging.info("Complete Repo ACLs Export Time: " + str(timedelta(seconds=end - start)))
+
+ def apply_acl_on_object(self, acl_str, error_logger, checkpoint_key_set):
+ """
+ apply the acl definition to the workspace object
+        the object_id from the export data is combined with the object type into a '/type/object_id' path,
+        which is used to craft the permissions api endpoint; setting acl definitions uses the patch rest api verb
+        :param acl_str: the complete string from the logfile. contains object defn and acl lists
+        :param error_logger: logger to log errors
+        :param checkpoint_key_set: checkpoint set used to skip objects whose ACLs were already applied
+ """
+ object_acl = json.loads(acl_str)
+ # the object_type
+ object_type = object_acl.get('object_type', None)
+ obj_path = object_acl['path']
+ logging.info(f"Working on ACL for path: {obj_path}")
+
+ if not checkpoint_key_set.contains(obj_path):
+ # We cannot modify '/Shared' directory's ACL
+ if obj_path == "/Shared" and object_type == "directory":
+ logging.info("We cannot modify /Shared directory's ACL. Skipping..")
+ checkpoint_key_set.write(obj_path)
+ return
+
+ if self.is_user_ws_item(obj_path):
+ ws_user = self.get_user(obj_path)
+ if not self.does_user_exist(ws_user):
+ logging.info(f"User workspace does not exist: {obj_path}, skipping ACL")
+ return
+ obj_status = self.get(WS_STATUS, {'path': obj_path})
+
+ if self.is_repo(obj_path):
+ if logging_utils.check_error(obj_status):
+ logging.warning(f"Could not apply ACL to repo {obj_path}")
+ return
+
+ if logging_utils.log_response_error(error_logger, obj_status):
+ return
+            logging.info("ws-stat: {0}".format(obj_status))
+ current_obj_id = obj_status.get('object_id', None)
+ if not current_obj_id:
+ error_logger.error(f'Object id missing from destination workspace: {obj_status}')
+ return
+ if object_type == 'directory':
+ object_id_with_type = f'/directories/{current_obj_id}'
+ elif object_type == 'notebook':
+ object_id_with_type = f'/notebooks/{current_obj_id}'
+ else:
+ error_logger.error(f'Object for Workspace ACLs is Undefined: {obj_status}')
+ return
+ api_path = '/permissions' + object_id_with_type
+ acl_list = object_acl.get('access_control_list', None)
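+            # build_acl_args converts the exported ACL entries into the access_control_list payload sent to the permissions PATCH endpoint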
+ access_control_list = self.build_acl_args(acl_list)
+ if access_control_list:
+ api_args = {'access_control_list': access_control_list}
+ resp = self.patch(api_path, api_args)
+
+                # if skipping non-existing users, add error code to allowlist
+                # (copy the list so the shared wmconstants constant is not mutated on every call)
+                ignore_error_list = list(wmconstants.IGNORE_ERROR_LIST)
+                if self.skip_missing_users:
+                    ignore_error_list.append("RESOURCE_DOES_NOT_EXIST")
+
+ if logging_utils.check_error(resp, ignore_error_list):
+ logging_utils.log_response_error(error_logger, resp)
+ else:
+ checkpoint_key_set.write(obj_path)
+ return
+
+ def import_workspace_acls(self, workspace_log_file='acl_notebooks.log',
+ dir_log_file='acl_directories.log',
+ repo_log_file='acl_repos.log', num_parallel=1):
+ """
+ import the notebook and directory acls by looping over notebook and dir logfiles
+ """
+ dir_acl_logs = self.get_export_dir() + dir_log_file
+ notebook_acl_logs = self.get_export_dir() + workspace_log_file
+ repo_acl_logs = self.get_export_dir() + repo_log_file
+
+ acl_notebooks_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT, self.get_export_dir())
+
+ checkpoint_notebook_acl_set = self._checkpoint_service.get_checkpoint_key_set(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_ACL_OBJECT)
+ with open(notebook_acl_logs, encoding="utf-8") as nb_acls_fp:
+ with ThreadPoolExecutor(max_workers=num_parallel) as executor:
+ futures = [executor.submit(self.apply_acl_on_object, nb_acl_str, acl_notebooks_error_logger, checkpoint_notebook_acl_set) for nb_acl_str in nb_acls_fp]
+ concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
+ propagate_exceptions(futures)
+
+ acl_dir_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT, self.get_export_dir())
+ checkpoint_dir_acl_set = self._checkpoint_service.get_checkpoint_key_set(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_DIRECTORY_ACL_OBJECT)
+
+ with open(dir_acl_logs, encoding='utf-8') as dir_acls_fp:
+ with ThreadPoolExecutor(max_workers=num_parallel) as executor:
+ futures = [executor.submit(self.apply_acl_on_object, dir_acl_str, acl_dir_error_logger, checkpoint_dir_acl_set) for dir_acl_str in dir_acls_fp]
+ concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
+ propagate_exceptions(futures)
+
+ acl_repo_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_REPO_ACL_OBJECT, self.get_export_dir())
+ checkpoint_repo_acl_set = self._checkpoint_service.get_checkpoint_key_set(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_REPO_ACL_OBJECT)
+
+ with open(repo_acl_logs, encoding='utf-8') as repo_acls_fp:
+ with ThreadPoolExecutor(max_workers=num_parallel) as executor:
+ futures = [
+ executor.submit(self.apply_acl_on_object, repo_acl_str, acl_repo_error_logger, checkpoint_repo_acl_set)
+ for repo_acl_str in repo_acls_fp]
+ concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
+ propagate_exceptions(futures)
+
+ print("Completed import ACLs of Repos, Notebooks and Directories")
+
+ def get_current_users(self):
+ """
+        get the number of defined user home directories in the new workspace.
+        if this is 0, we must create the users before importing the notebooks over;
+        we cannot create the user's home directory since it's a special type of directory
+ """
+ ws_users = self.get(WS_LIST, {'path': '/Users/'}).get('objects', None)
+ if ws_users:
+ return len(ws_users)
+ else:
+ return 0
+
+ def does_user_exist(self, username):
+ """
+ check if the users home dir exists
+ """
+ stat = self.get(WS_STATUS, {'path': '/Users/{0}'.format(username)})
+ if stat.get('object_type', None) == 'DIRECTORY':
+ return True
+ return False
+
+ def does_path_exist(self, dir_path):
+ status_resp = self.get(WS_STATUS, {'path': dir_path})
+ if 'error_code' in status_resp:
+ if status_resp.get('error_code') == 'RESOURCE_DOES_NOT_EXIST':
+ return False
+ else:
+ print('Failure:' + json.dumps(status_resp))
+ return False
+ return True
+
+ def import_current_workspace_items(self, artifact_dir='artifacts/'):
+ src_dir = self.get_export_dir() + artifact_dir
+ error_logger = logging_utils.get_error_logger(wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT,
+ self.get_export_dir())
+ for root, subdirs, files in self.walk(src_dir):
+ # replace the local directory with empty string to get the notebook workspace directory
+ nb_dir = '/' + root.replace(src_dir, '')
+ upload_dir = nb_dir
+ if not nb_dir == '/':
+ upload_dir = nb_dir + '/'
+ if not self.does_path_exist(upload_dir):
+ resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
+ if 'error_code' in resp_mkdirs:
+ logging_utils.log_response_error(error_logger, resp_mkdirs)
+ for f in files:
+ logging.info("Uploading: {0}".format(f))
+ # create the local file path to load the DBC file
+ local_file_path = os.path.join(root, f)
+ # create the ws full file path including filename
+ ws_file_path = upload_dir + f
+ # generate json args with binary data for notebook to upload to the workspace path
+ nb_input_args = self.get_user_import_args(local_file_path, ws_file_path)
+ # call import to the workspace
+ if self.is_verbose():
+ logging.info("Path: {0}".format(nb_input_args['path']))
+ resp_upload = self.post(WS_IMPORT, nb_input_args)
+ if 'error_code' in resp_upload:
+ resp_upload['path'] = nb_input_args['path']
+ logging_utils.log_response_error(error_logger, resp_upload)
+
+ def import_all_workspace_items(self, artifact_dir='artifacts/',
+ archive_missing=False, num_parallel=4, last_session=""):
+ """
+        import all notebooks into a new workspace. Walks the entire artifacts/ directory in parallel, and also
+        uploads all the files in each of the directories in parallel.
+
+ WARNING: Because it parallelizes both on directory walking and file uploading, it can spawn as many threads as
+ num_parallel * num_parallel
+
+        :param artifact_dir: notebook download directory
+        :param archive_missing: whether to put missing users into a /Archive/ top level directory
+        :param last_session: a previous session against which the current session will be compared. Only the changed and new notebooks will be imported if last_session is defined.
+ """
+ src_dir = self.get_export_dir() + artifact_dir
+ error_logger = logging_utils.get_error_logger(wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT,
+ self.get_export_dir())
+
+ # Given previous exported artifacts, a list of changed and newly added notebooks will be logged at notebook_changes.log
+ changes_since_last = set()
+ if last_session:
+ nb_changes_log = os.path.join(self.get_export_dir(), "notebook_changes.log")
+ base_dir = os.path.split(os.path.normpath(self.get_export_dir()))[0]
+ last_src_dir = os.path.join(base_dir, last_session, artifact_dir)
+ changes_since_last = get_updated_new_files(last_src_dir, src_dir)
+ log_file_changes(changes_since_last, nb_changes_log)
+
+ checkpoint_notebook_set = self._checkpoint_service.get_checkpoint_key_set(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_NOTEBOOK_OBJECT)
+ num_exported_users = self.get_num_of_saved_users(src_dir)
+ num_current_users = self.get_current_users()
+ if num_current_users == 0:
+ logging.info("No registered users in existing environment. Please import users / groups first.")
+ raise ValueError("No registered users in the current environment")
+ if (num_current_users < num_exported_users) and (not archive_missing):
+ logging.info("Exported number of user workspaces: {0}".format(num_exported_users))
+ logging.info("Current number of user workspaces: {0}".format(num_current_users))
+ logging.info("Re-run with the `--archive-missing` flag to load missing users into a separate directory")
+ raise ValueError("Current number of users is less than number of user workspaces to import.")
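+        # cache of users whose home folder is missing in the destination; with --archive-missing their content is uploaded under /Archive/ instead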
+ archive_users = set()
+
+ def _upload_all_files(root, subdirs, files):
+ '''
+ Upload all files in parallel in root (current) directory.
+ '''
+ # replace the local directory with empty string to get the notebook workspace directory
+ nb_dir = '/' + root.replace(src_dir, '')
+ upload_dir = nb_dir
+ if not nb_dir == '/':
+ upload_dir = nb_dir + '/'
+ if self.is_user_ws_item(upload_dir):
+ ws_user = self.get_user(upload_dir)
+ if archive_missing:
+ if ws_user in archive_users:
+ upload_dir = upload_dir.replace('Users', 'Archive', 1)
+ elif not self.does_user_exist(ws_user):
+ # add the user to the cache / set of missing users
+ logging.info("User workspace does not exist, adding to archive cache: {0}".format(ws_user))
+ archive_users.add(ws_user)
+ # append the archive path to the upload directory
+ upload_dir = upload_dir.replace('Users', 'Archive', 1)
+ else:
+ logging.info("User workspace exists: {0}".format(ws_user))
+ elif not self.does_user_exist(ws_user):
+ logging.info("User {0} is missing. "
+ "Please re-run with --archive-missing flag "
+ "or first verify all users exist in the new workspace".format(ws_user))
+ return
+ else:
+ logging.info("Uploading for user: {0}".format(ws_user))
+ # make the top level folder before uploading files within the loop
+ if not self.is_user_ws_root(upload_dir):
+ # if it is not the /Users/example@example.com/ root path, don't create the folder
+ resp_mkdirs = self.post(WS_MKDIRS, {'path': upload_dir})
+ if 'error_code' in resp_mkdirs:
+ resp_mkdirs['path'] = upload_dir
+ logging_utils.log_response_error(error_logger, resp_mkdirs)
+
+ def _file_upload_helper(f):
+ logging.info("Uploading: {0}".format(f))
+ # create the local file path to load the DBC file
+ local_file_path = os.path.join(root, f)
+ # create the ws full file path including filename
+ ws_file_path = upload_dir + f
+ if checkpoint_notebook_set.contains(ws_file_path):
+ return
+ if changes_since_last:
+ if local_file_path not in changes_since_last:
+ print(f"Skipping {f} because it has not been changed.")
+ return
+ else:
+ print(f"Importing {f} because it has been changed.")
+ # generate json args with binary data for notebook to upload to the workspace path
+ nb_input_args = self.get_user_import_args(local_file_path, ws_file_path)
+ # call import to the workspace
+ if self.is_verbose():
+ logging.info("Path: {0}".format(nb_input_args['path']))
+ resp_upload = self.post(WS_IMPORT, nb_input_args)
+ if 'error_code' in resp_upload:
+ resp_upload['path'] = ws_file_path
+ logging.info(f'Error uploading file: {ws_file_path}')
+ logging_utils.log_response_error(error_logger, resp_upload)
+ else:
+ checkpoint_notebook_set.write(ws_file_path)
+
+ with ThreadPoolExecutor(max_workers=num_parallel) as executor:
+ futures = [executor.submit(_file_upload_helper, file) for file in files]
+ concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
+ propagate_exceptions(futures)
+
+ with ThreadPoolExecutor(max_workers=num_parallel) as executor:
+ futures = [executor.submit(_upload_all_files, walk[0], walk[1], walk[2]) for walk in self.walk(src_dir)]
+ concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
+ propagate_exceptions(futures)
+
+ def import_all_repos(self, repo_log_file="repos.log", num_parallel=1):
+ dir_repo_logs = self.get_export_dir() + repo_log_file
+
+        # check to see if git creds are set up - repo import will fail if not
+ git_cred_api_path = "/git-credentials"
+ resp = self.get(git_cred_api_path)
+ if not resp.get("credentials", None):
+ logging.info("Repo import will be skipped; repos can only be imported if Git credentials are first set up.")
+ logging.info("To import repos separately, please run repo_importer.py")
+ return
+
+ repo_error_logger = logging_utils.get_error_logger(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_REPO_OBJECT, self.get_export_dir())
+ checkpoint_repo_set = self._checkpoint_service.get_checkpoint_key_set(
+ wmconstants.WM_IMPORT, wmconstants.WORKSPACE_REPO_OBJECT)
+
+ with open(dir_repo_logs, encoding='utf-8') as repo_fp:
+ with ThreadPoolExecutor(max_workers=num_parallel) as executor:
+ futures = [
+ executor.submit(self.create_repo, repo_str, repo_error_logger,
+ checkpoint_repo_set)
+ for repo_str in repo_fp]
+ concurrent.futures.wait(futures, return_when="FIRST_EXCEPTION")
+ propagate_exceptions(futures)
+
+ def create_repo(self, repo_str, error_logger, checkpoint_repo_set):
+ api_path = '/repos'
+ repo_json = json.loads(repo_str)
+ repo_url = repo_json.get('url', None)
+ if repo_url:
+ logging.info("Repo: {0}".format(repo_json.get('path', '')))
+ resp = self.post(api_path, repo_json)
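+            # if the parent folder under /Repos does not exist yet, parse it out of the error message below, create it, and retry the repo creation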
+ if (resp.get('error_code') == "RESOURCE_DOES_NOT_EXIST") and \
+ (resp.get('http_status_code') == 404):
+ parent_directory = re.sub(r"^RESOURCE_DOES_NOT_EXIST: Parent directory ", '', resp.get('message'))
+ parent_directory = re.sub(r" does not exist.$", '', parent_directory)
+ if re.fullmatch(
+ r'/Repos/.+[^/]', parent_directory
+ ):
+ logging.info(f"Creating parent directory {parent_directory}")
+ resp2 = self.post('/workspace/mkdirs', {"path": parent_directory})
+ if logging_utils.check_error(resp2):
+ logging_utils.log_response_error(error_logger, resp2)
+ else:
+ logging.info(f"2nd attempt to create: {repo_json.get('path', '')}")
+ resp = self.post(api_path, repo_json)
+ if logging_utils.check_error(resp):
+ logging_utils.log_response_error(error_logger, resp)
+ else:
+ checkpoint_repo_set.write(repo_url)
+ else:
+ logging.info(f"Could not import repo {repo_json.get('path', '')}; only remote repos can be created via API.")
diff --git a/convert_all_logs.py b/convert_all_logs.py
new file mode 100644
index 0000000..90d56e8
--- /dev/null
+++ b/convert_all_logs.py
@@ -0,0 +1,117 @@
+###################### importing other scripts ##############################################
+from utils import to_csv as util
+from utils import create_asset_mapping_spreadsheet as create_spreadsheet
+############################################################################################
+import argparse
+import os
+
+def main(checkpoint, destination="csv"):
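+    """Convert each exported .log file from the given checkpoint session into a csv file, then bundle the csv files into asset_mapping.xlsx."""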
+ # where you want the csv files to be located
+    # make the csv directory if it's not there
+ if destination not in os.listdir():
+ print(f"Creating {destination}...")
+ os.mkdir(f"./{destination}")
+
+ # users
+ users_data = util.read_log("users.log", checkpoint)
+ if users_data == 1:
+ print("users.log not found in checkpoint session")
+ else:
+ users_df = util.create_users(users_data)
+ util.save_to_csv(users_df, "users.csv", destination)
+
+ # instance profiles
+ ip_data = util.read_log("instance_profiles.log", checkpoint)
+ if ip_data == 1: # file not found
+ print("instance_profiles.log not found in checkpoint session. Skipping...")
+ else:
+ ip_df = util.create_instance_profiles(ip_data)
+ util.save_to_csv(ip_df, "instance_profiles.csv", destination)
+
+ # instance pools
+ ipo_data = util.read_log("instance_pools.log", checkpoint)
+ if ipo_data == 1: #file not found
+ print("instance_pools.log not found in checkpoint session. Skipping...")
+ else:
+ ipo_df = util.create_instance_pools(ipo_data)
+ util.save_to_csv(ipo_df, "instance_pools.csv", destination)
+
+ # groups
+ groups_df = util.create_groups("groups", checkpoint)
+    if groups_df == 1:
+        print("groups.log not found in checkpoint session. Skipping...")
+    else:
+        util.save_to_csv(groups_df, "groups.csv", destination)
+
+ # clusters
+ clusters_data = util.read_log("clusters.log", checkpoint)
+    if clusters_data == 1:  # file not found
+ print("clusters.log not found in checkpoint session. Skipping... ")
+ else:
+ clusters_df = util.create_clusters(clusters_data)
+ util.save_to_csv(clusters_df, "clusters.csv", destination)
+
+ # cluster policies
+ cluster_policies_data = util.read_log('cluster_policies.log', checkpoint)
+ if cluster_policies_data == 1: #file not found
+ print("cluster_policies.log not found in checkpoint session. Skipping... ")
+ else:
+ clusters_policies_df = util.create_cluster_policies(cluster_policies_data)
+ util.save_to_csv(clusters_policies_df, "cluster_policies.csv", destination)
+
+ # job
+ jobs_data = util.read_log('jobs.log', checkpoint)
+ if jobs_data == 1: #file not found
+ print("jobs.log not found in checkpoint session. Skipping... ")
+ else:
+ jobs_acls = util.read_log('acl_jobs.log', checkpoint)
+ jobs_df = util.create_jobs(jobs_data, jobs_acls)
+ util.save_to_csv(jobs_df, "jobs.csv", destination)
+
+ # shared
+    shared_df = util.create_shared_logs("artifacts/Shared", checkpoint)
+    if shared_df == 1: # file not found
+        print("Shared notebooks not found in checkpoint session. Skipping... ")
+    else:
+        util.save_to_csv(shared_df, 'global_shared_logs.csv', destination)
+
+    # other artifacts
+    other_df = util.create_other_artifacts("artifacts", checkpoint)
+    if other_df == 1: # file not found
+        print("Global artifacts not found in checkpoint session. Skipping... ")
+    else:
+        util.save_to_csv(other_df, "global_logs.csv", destination)
+
+ # libraries
+ libraries_data = util.read_log("libraries.log", checkpoint)
+ if libraries_data == 1: # not found
+ print("libraries.log not found in checkpoint session. Skipping...")
+ else:
+ libraries_df = util.create_libraries(libraries_data)
+ util.save_to_csv(libraries_df, "libraries.csv", destination)
+
+ # secret scopes
+ scopes_df = util.create_scopes("secret_scopes", checkpoint)
+    if scopes_df == 1:
+        print("secret_scopes.log not found in checkpoint session. Skipping...")
+    else:
+        util.save_to_csv(scopes_df, "secret_scopes.csv", destination)
+
+ # just databases
+ databases_df = util.create_database(checkpoint, directory_name = 'metastore')
+    if databases_df == 1:
+        print("metastore.log not found in checkpoint session. Skipping...")
+    else:
+        util.save_to_csv(databases_df, "databases.csv", destination)
+
+ # entire metastore
+ metastore_df = util.create_metastore(checkpoint, directory_name = 'metastore')
+    if metastore_df == 1:
+        print("metastore.log not found in checkpoint session. Skipping...")
+    else:
+        util.save_to_csv(metastore_df, "metastore.csv", destination)
+
+ create_spreadsheet.csv_to_excel(f"./{destination}")
+ print("Successfully created spreadsheet asset_mapping.xlsx. ")
+
+if __name__ == "__main__":
+ all_args = argparse.ArgumentParser()
+ all_args.add_argument("--checkpoint", "--session", dest="checkpoint", default="", help="set if you are using a checkpoint during export")
+ all_args.add_argument("--destination", dest="destination", default="csv", help="destination of converted logs (default: /csv)")
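+    # example invocation (the session name is illustrative): python convert_all_logs.py --checkpoint <session_name> --destination csv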
+
+ args = all_args.parse_args()
+ main(args.checkpoint, args.destination)
diff --git a/data/aws_cluster.json b/data/aws_cluster.json
index 6ad6a20..012abc3 100644
--- a/data/aws_cluster.json
+++ b/data/aws_cluster.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
- "cluster_name": "Workspace_Migration_Work_Leave_Me_Alone",
+ "cluster_name": "E2_Migration",
"spark_version": "10.4.x-scala2.12",
"aws_attributes": {
"first_on_demand": 1,
diff --git a/data/aws_cluster_hipaa.json b/data/aws_cluster_hipaa.json
index 40b9a2a..9ab7424 100644
--- a/data/aws_cluster_hipaa.json
+++ b/data/aws_cluster_hipaa.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
- "cluster_name": "Workspace_Migration_Work_Leave_Me_Alone",
+ "cluster_name": "E2_Migration",
"spark_version": "10.4.x-scala2.12",
"aws_attributes": {
"first_on_demand": 1,
diff --git a/data/aws_cluster_table_acls.json b/data/aws_cluster_table_acls.json
index b3f521e..94738c2 100644
--- a/data/aws_cluster_table_acls.json
+++ b/data/aws_cluster_table_acls.json
@@ -1,13 +1,9 @@
{
"num_workers": 1,
- "cluster_name": "API_Table_ACL_Work_Leave_Me_Alone",
+ "cluster_name": "E2_Migration_Table_ACLs",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {
- "spark.databricks.cluster.profile": "serverless",
- "spark.databricks.repl.allowedLanguages": "python,sql",
- "spark.databricks.acl.dfAclsEnabled": "true",
- "spark.sql.hive.metastore.version": "1.2.1",
- "spark.sql.hive.metastore.jars": "maven"
+ "spark.databricks.acl.dfAclsEnabled": "true"
},
"aws_attributes": {
"first_on_demand": 1,
diff --git a/data/aws_cluster_table_acls_hipaa.json b/data/aws_cluster_table_acls_hipaa.json
index 781f376..4610061 100644
--- a/data/aws_cluster_table_acls_hipaa.json
+++ b/data/aws_cluster_table_acls_hipaa.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
- "cluster_name": "API_Table_ACL_Work_Leave_Me_Alone",
+ "cluster_name": "E2_Migration_Table_ACLs",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {
"spark.databricks.cluster.profile": "serverless",
diff --git a/data/azure_cluster.json b/data/azure_cluster.json
index e2054c5..61c42eb 100644
--- a/data/azure_cluster.json
+++ b/data/azure_cluster.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
- "cluster_name": "API_Metastore_Work_Leave_Me_Alone",
+ "cluster_name": "E2_Migration",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {},
"node_type_id": "Standard_D8_v3",
diff --git a/data/azure_cluster_table_acls.json b/data/azure_cluster_table_acls.json
index 7b6e320..7163cc0 100644
--- a/data/azure_cluster_table_acls.json
+++ b/data/azure_cluster_table_acls.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
- "cluster_name": "API_Table_ACL_Work_Leave_Me_Alone",
+ "cluster_name": "E2_Migration_Table_ACLs",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {
"spark.databricks.cluster.profile": "serverless",
diff --git a/data/default_jobs_cluster_aws.json b/data/default_jobs_cluster_aws.json
index e6ab94b..c1d58b0 100644
--- a/data/default_jobs_cluster_aws.json
+++ b/data/default_jobs_cluster_aws.json
@@ -1,6 +1,6 @@
{
"num_workers": 8,
- "spark_version": "7.3.x-scala2.12",
+ "spark_version": "14.3.x-scala2.12",
"node_type_id": "i3.xlarge",
"spark_env_vars": {
"PYSPARK_PYTHON": "/databricks/python3/bin/python3"
diff --git a/data/default_jobs_cluster_aws_hipaa.json b/data/default_jobs_cluster_aws_hipaa.json
index da0f2e6..42e12cf 100644
--- a/data/default_jobs_cluster_aws_hipaa.json
+++ b/data/default_jobs_cluster_aws_hipaa.json
@@ -1,6 +1,6 @@
{
"num_workers": 8,
- "spark_version": "7.3.x-scala2.12",
+ "spark_version": "14.3.x-scala2.12",
"node_type_id": "i4i.xlarge",
"spark_env_vars": {
"PYSPARK_PYTHON": "/databricks/python3/bin/python3"
diff --git a/data/gcp_cluster.json b/data/gcp_cluster.json
index 3a7c07e..0f0a70c 100644
--- a/data/gcp_cluster.json
+++ b/data/gcp_cluster.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
- "cluster_name": "Workspace_Migration_Work_Leave_Me_Alone",
+ "cluster_name": "E2_Migration",
"spark_version": "10.4.x-scala2.12",
"gcp_attributes": {
"first_on_demand": 1
diff --git a/data/gcp_cluster_table_acls.json b/data/gcp_cluster_table_acls.json
index 062c24b..3b27540 100644
--- a/data/gcp_cluster_table_acls.json
+++ b/data/gcp_cluster_table_acls.json
@@ -1,5 +1,5 @@
{
- "cluster_name": "API_Table_ACL_Work_Leave_Me_Alone",
+ "cluster_name": "E2_Migration_Table_ACLs",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {
"spark.databricks.cluster.profile": "serverless",
diff --git a/data/nitro_mapping.csv b/data/nitro_mapping.csv
new file mode 100644
index 0000000..74bde2e
--- /dev/null
+++ b/data/nitro_mapping.csv
@@ -0,0 +1,301 @@
+PVC Instance Type,Recommended Nitro Instance Type,
+m4.large,m5n.large,FALSE
+m4.xlarge,m5n.xlarge,FALSE
+m4.2xlarge,m5n.2xlarge,FALSE
+m4.4xlarge,m5n.4xlarge,FALSE
+m4.10xlarge,m5n.12xlarge,FALSE
+m4.16xlarge,m5n.16xlarge,FALSE
+m5.large,m5n.large,FALSE
+m5.xlarge,m5n.xlarge,FALSE
+m5.2xlarge,m5n.2xlarge,FALSE
+m5.4xlarge,m5n.4xlarge,FALSE
+m5.8xlarge,m5n.8xlarge,FALSE
+m5.12xlarge,m5n.12xlarge,FALSE
+m5.16xlarge,m5n.16xlarge,FALSE
+m5.24xlarge,m5n.24xlarge,FALSE
+m5d.large,m5dn.large,FALSE
+m5d.xlarge,m5dn.xlarge,FALSE
+m5d.2xlarge,m5dn.2xlarge,FALSE
+m5d.4xlarge,m5dn.4xlarge,FALSE
+m5d.8xlarge,m5dn.8xlarge,FALSE
+m5d.12xlarge,m5dn.12xlarge,FALSE
+m5d.16xlarge,m5dn.16xlarge,FALSE
+m5d.24xlarge,m5dn.24xlarge,FALSE
+m5a.large,m5n.large,FALSE
+m5a.xlarge,m5n.xlarge,FALSE
+m5a.2xlarge,m5n.2xlarge,FALSE
+m5a.4xlarge,m5n.4xlarge,FALSE
+m5a.8xlarge,m5n.8xlarge,FALSE
+m5a.12xlarge,m5n.12xlarge,FALSE
+m5a.16xlarge,m5n.16xlarge,FALSE
+m5a.24xlarge,m5n.24xlarge,FALSE
+m6g.large,m5n.large,FALSE
+m6g.xlarge,m5n.xlarge,FALSE
+m6g.2xlarge,m5n.2xlarge,FALSE
+m6g.4xlarge,m5n.4xlarge,FALSE
+m6g.8xlarge,m5n.8xlarge,FALSE
+m6g.12xlarge,m5n.12xlarge,FALSE
+m6g.16xlarge,m5n.16xlarge,FALSE
+m6gd.large,m5dn.large,FALSE
+m6gd.xlarge,m5dn.xlarge,FALSE
+m6gd.2xlarge,m5dn.2xlarge,FALSE
+m6gd.4xlarge,m5dn.4xlarge,FALSE
+m6gd.8xlarge,m5dn.8xlarge,FALSE
+m6gd.12xlarge,m5dn.12xlarge,FALSE
+m6gd.16xlarge,m5dn.16xlarge,FALSE
+c4.2xlarge,c5a.2xlarge,FALSE
+c4.4xlarge,c5a.4xlarge,FALSE
+c4.8xlarge,c5a.8xlarge,FALSE
+c5.xlarge,c5a.xlarge,FALSE
+c5.2xlarge,c5a.2xlarge,FALSE
+c5.4xlarge,c5a.4xlarge,FALSE
+c5.9xlarge,c5a.8xlarge,FALSE
+c5.12xlarge,c5a.12xlarge,FALSE
+c5.18xlarge,c5a.16xlarge,FALSE
+c5.24xlarge,c5a.24xlarge,FALSE
+c5d.xlarge,c5ad.xlarge,FALSE
+c5d.2xlarge,c5ad.2xlarge,FALSE
+c5d.4xlarge,c5ad.4xlarge,FALSE
+c5d.9xlarge,c5ad.8xlarge,FALSE
+c5d.12xlarge,c5ad.12xlarge,FALSE
+c5d.18xlarge,c5ad.16xlarge,FALSE
+c5d.24xlarge,c5ad.24xlarge,FALSE
+c6g.xlarge,c5a.xlarge,FALSE
+c6g.2xlarge,c5a.2xlarge,FALSE
+c6g.4xlarge,c5a.4xlarge,FALSE
+c6g.8xlarge,c5a.8xlarge,FALSE
+c6g.12xlarge,c5a.12xlarge,FALSE
+c6g.16xlarge,c5a.16xlarge,FALSE
+c6gd.xlarge,c5ad.xlarge,FALSE
+c6gd.2xlarge,c5ad.2xlarge,FALSE
+c6gd.4xlarge,c5ad.4xlarge,FALSE
+c6gd.8xlarge,c5ad.8xlarge,FALSE
+c6gd.12xlarge,c5ad.12xlarge,FALSE
+c6gd.16xlarge,c5ad.16xlarge,FALSE
+r3.xlarge,r5n.xlarge,FALSE
+r3.2xlarge,r5n.2xlarge,FALSE
+r3.4xlarge,r5n.4xlarge,FALSE
+r3.8xlarge,r5n.8xlarge,FALSE
+r4.xlarge,r5n.xlarge,FALSE
+r4.2xlarge,r5n.2xlarge,FALSE
+r4.4xlarge,r5n.4xlarge,FALSE
+r4.8xlarge,r5n.8xlarge,FALSE
+r4.16xlarge,r5n.16xlarge,FALSE
+r5.large,r5n.large,FALSE
+r5.xlarge,r5n.xlarge,FALSE
+r5.2xlarge,r5n.2xlarge,FALSE
+r5.4xlarge,r5n.4xlarge,FALSE
+r5.8xlarge,r5n.8xlarge,FALSE
+r5.12xlarge,r5n.12xlarge,FALSE
+r5.16xlarge,r5n.16xlarge,FALSE
+r5.24xlarge,r5n.24xlarge,FALSE
+r5d.large,r5dn.large,FALSE
+r5d.xlarge,r5dn.xlarge,FALSE
+r5d.2xlarge,r5dn.2xlarge,FALSE
+r5d.4xlarge,r5dn.4xlarge,FALSE
+r5d.8xlarge,r5dn.8xlarge,FALSE
+r5d.12xlarge,r5dn.12xlarge,FALSE
+r5d.16xlarge,r5dn.16xlarge,FALSE
+r5d.24xlarge,r5dn.24xlarge,FALSE
+r5a.large,r5n.large,FALSE
+r5a.xlarge,r5n.xlarge,FALSE
+r5a.2xlarge,r5n.2xlarge,FALSE
+r5a.4xlarge,r5n.4xlarge,FALSE
+r5a.8xlarge,r5n.8xlarge,FALSE
+r5a.12xlarge,r5n.12xlarge,FALSE
+r5a.16xlarge,r5n.16xlarge,FALSE
+r5a.24xlarge,r5n.24xlarge,FALSE
+r6g.large,r5n.large,FALSE
+r6g.xlarge,r5n.xlarge,FALSE
+r6g.2xlarge,r5n.2xlarge,FALSE
+r6g.4xlarge,r5n.4xlarge,FALSE
+r6g.8xlarge,r5n.8xlarge,FALSE
+r6g.12xlarge,r5n.12xlarge,FALSE
+r6g.16xlarge,r5n.16xlarge,FALSE
+r6gd.large,r5dn.large,FALSE
+r6gd.xlarge,r5dn.xlarge,FALSE
+r6gd.2xlarge,r5dn.2xlarge,FALSE
+r6gd.4xlarge,r5dn.4xlarge,FALSE
+r6gd.8xlarge,r5dn.8xlarge,FALSE
+r6gd.12xlarge,r5dn.12xlarge,FALSE
+r6gd.16xlarge,r5dn.16xlarge,FALSE
+i3.large,i4i.large,FALSE
+i3.xlarge,i4i.xlarge,FALSE
+i3.2xlarge,i4i.2xlarge,FALSE
+i3.4xlarge,i4i.4xlarge,FALSE
+i3.8xlarge,i4i.8xlarge,FALSE
+i3.16xlarge,i4i.16xlarge,FALSE
+i2.xlarge,i3en.xlarge,FALSE
+i2.2xlarge,i3en.2xlarge,FALSE
+i2.4xlarge,i3en.3xlarge,FALSE
+i2.8xlarge,i3en.6xlarge,FALSE
+p2.xlargeGPU,g4ad.4xlargeGPU,FALSE
+p2.8xlargeGPU,g4ad.8xlargeGPU,FALSE
+p2.16xlargeGPU,g4ad.16xlargeGPU,FALSE
+p3.2xlargeGPU,g4ad.4xlargeGPU,FALSE
+p3.8xlargeGPU,g4ad.8xlargeGPU,FALSE
+p3.16xlargeGPU,g4ad.16xlargeGPU,FALSE
+g5.xlargeGPU,g4dn.xlargeGPU,FALSE
+g5.2xlargeGPU,g4dn.2xlargeGPU,FALSE
+g5.4xlargeGPU,g4dn.4xlargeGPU,FALSE
+g5.8xlargeGPU,g4dn.8xlargeGPU,FALSE
+g5.16xlargeGPU,g4dn.12xlargeGPU,FALSE
+g5.12xlargeGPU,g4dn.16xlargeGPU,FALSE
+g5.24xlargeGPU,p3dn.24xlargeGPU,FALSE
+g5.48xlargeGPU,p3dn.24xlargeGPU,FALSE
+z1d.large,r5n.large,FALSE
+z1d.xlarge,r5n.xlarge,FALSE
+z1d.2xlarge,r5n.2xlarge,FALSE
+z1d.3xlarge,r5n.4xlarge,FALSE
+z1d.6xlarge,r5n.8xlarge,FALSE
+z1d.12xlarge,r5n.12xlarge,FALSE
+m5n.large,m5n.large,FALSE
+m5n.xlarge,m5n.xlarge,FALSE
+m5n.2xlarge,m5n.2xlarge,FALSE
+m5n.4xlarge,m5n.4xlarge,FALSE
+m5n.12xlarge,m5n.12xlarge,FALSE
+m5n.16xlarge,m5n.16xlarge,FALSE
+m5n.large,m5n.large,FALSE
+m5n.xlarge,m5n.xlarge,FALSE
+m5n.2xlarge,m5n.2xlarge,FALSE
+m5n.4xlarge,m5n.4xlarge,FALSE
+m5n.8xlarge,m5n.8xlarge,FALSE
+m5n.12xlarge,m5n.12xlarge,FALSE
+m5n.16xlarge,m5n.16xlarge,FALSE
+m5n.24xlarge,m5n.24xlarge,FALSE
+m5dn.large,m5dn.large,FALSE
+m5dn.xlarge,m5dn.xlarge,FALSE
+m5dn.2xlarge,m5dn.2xlarge,FALSE
+m5dn.4xlarge,m5dn.4xlarge,FALSE
+m5dn.8xlarge,m5dn.8xlarge,FALSE
+m5dn.12xlarge,m5dn.12xlarge,FALSE
+m5dn.16xlarge,m5dn.16xlarge,FALSE
+m5dn.24xlarge,m5dn.24xlarge,FALSE
+m5n.large,m5n.large,FALSE
+m5n.xlarge,m5n.xlarge,FALSE
+m5n.2xlarge,m5n.2xlarge,FALSE
+m5n.4xlarge,m5n.4xlarge,FALSE
+m5n.8xlarge,m5n.8xlarge,FALSE
+m5n.12xlarge,m5n.12xlarge,FALSE
+m5n.16xlarge,m5n.16xlarge,FALSE
+m5n.24xlarge,m5n.24xlarge,FALSE
+m5n.large,m5n.large,FALSE
+m5n.xlarge,m5n.xlarge,FALSE
+m5n.2xlarge,m5n.2xlarge,FALSE
+m5n.4xlarge,m5n.4xlarge,FALSE
+m5n.8xlarge,m5n.8xlarge,FALSE
+m5n.12xlarge,m5n.12xlarge,FALSE
+m5n.16xlarge,m5n.16xlarge,FALSE
+m5dn.large,m5dn.large,FALSE
+m5dn.xlarge,m5dn.xlarge,FALSE
+m5dn.2xlarge,m5dn.2xlarge,FALSE
+m5dn.4xlarge,m5dn.4xlarge,FALSE
+m5dn.8xlarge,m5dn.8xlarge,FALSE
+m5dn.12xlarge,m5dn.12xlarge,FALSE
+m5dn.16xlarge,m5dn.16xlarge,FALSE
+c5a.2xlarge,c5a.2xlarge,FALSE
+c5a.4xlarge,c5a.4xlarge,FALSE
+c5a.8xlarge,c5a.8xlarge,FALSE
+c5a.xlarge,c5a.xlarge,FALSE
+c5a.2xlarge,c5a.2xlarge,FALSE
+c5a.4xlarge,c5a.4xlarge,FALSE
+c5a.8xlarge,c5a.8xlarge,FALSE
+c5a.12xlarge,c5a.12xlarge,FALSE
+c5a.16xlarge,c5a.16xlarge,FALSE
+c5a.24xlarge,c5a.24xlarge,FALSE
+c5ad.xlarge,c5ad.xlarge,FALSE
+c5ad.2xlarge,c5ad.2xlarge,FALSE
+c5ad.4xlarge,c5ad.4xlarge,FALSE
+c5ad.8xlarge,c5ad.8xlarge,FALSE
+c5ad.12xlarge,c5ad.12xlarge,FALSE
+c5ad.16xlarge,c5ad.16xlarge,FALSE
+c5ad.24xlarge,c5ad.24xlarge,FALSE
+c5a.xlarge,c5a.xlarge,FALSE
+c5a.2xlarge,c5a.2xlarge,FALSE
+c5a.4xlarge,c5a.4xlarge,FALSE
+c5a.8xlarge,c5a.8xlarge,FALSE
+c5a.12xlarge,c5a.12xlarge,FALSE
+c5a.16xlarge,c5a.16xlarge,FALSE
+c5ad.xlarge,c5ad.xlarge,FALSE
+c5ad.2xlarge,c5ad.2xlarge,FALSE
+c5ad.4xlarge,c5ad.4xlarge,FALSE
+c5ad.8xlarge,c5ad.8xlarge,FALSE
+c5ad.12xlarge,c5ad.12xlarge,FALSE
+c5ad.16xlarge,c5ad.16xlarge,FALSE
+r5n.xlarge,r5n.xlarge,FALSE
+r5n.2xlarge,r5n.2xlarge,FALSE
+r5n.4xlarge,r5n.4xlarge,FALSE
+r5n.8xlarge,r5n.8xlarge,FALSE
+r5n.xlarge,r5n.xlarge,FALSE
+r5n.2xlarge,r5n.2xlarge,FALSE
+r5n.4xlarge,r5n.4xlarge,FALSE
+r5n.8xlarge,r5n.8xlarge,FALSE
+r5n.16xlarge,r5n.16xlarge,FALSE
+r5n.large,r5n.large,FALSE
+r5n.xlarge,r5n.xlarge,FALSE
+r5n.2xlarge,r5n.2xlarge,FALSE
+r5n.4xlarge,r5n.4xlarge,FALSE
+r5n.8xlarge,r5n.8xlarge,FALSE
+r5n.12xlarge,r5n.12xlarge,FALSE
+r5n.16xlarge,r5n.16xlarge,FALSE
+r5n.24xlarge,r5n.24xlarge,FALSE
+r5dn.large,r5dn.large,FALSE
+r5dn.xlarge,r5dn.xlarge,FALSE
+r5dn.2xlarge,r5dn.2xlarge,FALSE
+r5dn.4xlarge,r5dn.4xlarge,FALSE
+r5dn.8xlarge,r5dn.8xlarge,FALSE
+r5dn.12xlarge,r5dn.12xlarge,FALSE
+r5dn.16xlarge,r5dn.16xlarge,FALSE
+r5dn.24xlarge,r5dn.24xlarge,FALSE
+r5n.large,r5n.large,FALSE
+r5n.xlarge,r5n.xlarge,FALSE
+r5n.2xlarge,r5n.2xlarge,FALSE
+r5n.4xlarge,r5n.4xlarge,FALSE
+r5n.8xlarge,r5n.8xlarge,FALSE
+r5n.12xlarge,r5n.12xlarge,FALSE
+r5n.16xlarge,r5n.16xlarge,FALSE
+r5n.24xlarge,r5n.24xlarge,FALSE
+r5n.large,r5n.large,FALSE
+r5n.xlarge,r5n.xlarge,FALSE
+r5n.2xlarge,r5n.2xlarge,FALSE
+r5n.4xlarge,r5n.4xlarge,FALSE
+r5n.8xlarge,r5n.8xlarge,FALSE
+r5n.12xlarge,r5n.12xlarge,FALSE
+r5n.16xlarge,r5n.16xlarge,FALSE
+r5dn.large,r5dn.large,FALSE
+r5dn.xlarge,r5dn.xlarge,FALSE
+r5dn.2xlarge,r5dn.2xlarge,FALSE
+r5dn.4xlarge,r5dn.4xlarge,FALSE
+r5dn.8xlarge,r5dn.8xlarge,FALSE
+r5dn.12xlarge,r5dn.12xlarge,FALSE
+r5dn.16xlarge,r5dn.16xlarge,FALSE
+i4i.large,i4i.large,FALSE
+i4i.xlarge,i4i.xlarge,FALSE
+i4i.2xlarge,i4i.2xlarge,FALSE
+i4i.4xlarge,i4i.4xlarge,FALSE
+i4i.8xlarge,i4i.8xlarge,FALSE
+i4i.16xlarge,i4i.16xlarge,FALSE
+i3en.xlarge,i3en.xlarge,FALSE
+i3en.2xlarge,i3en.2xlarge,FALSE
+i3en.3xlarge,i3en.3xlarge,FALSE
+i3en.6xlarge,i3en.6xlarge,FALSE
+g4ad.4xlargeGPU,g4ad.4xlargeGPU,FALSE
+g4ad.8xlargeGPU,g4ad.8xlargeGPU,FALSE
+g4ad.16xlargeGPU,g4ad.16xlargeGPU,FALSE
+g4ad.4xlargeGPU,g4ad.4xlargeGPU,FALSE
+g4ad.8xlargeGPU,g4ad.8xlargeGPU,FALSE
+g4ad.16xlargeGPU,g4ad.16xlargeGPU,FALSE
+g4dn.xlargeGPU,g4dn.xlargeGPU,FALSE
+g4dn.2xlargeGPU,g4dn.2xlargeGPU,FALSE
+g4dn.4xlargeGPU,g4dn.4xlargeGPU,FALSE
+g4dn.8xlargeGPU,g4dn.8xlargeGPU,FALSE
+g4dn.12xlargeGPU,g4dn.12xlargeGPU,FALSE
+g4dn.16xlargeGPU,g4dn.16xlargeGPU,FALSE
+p3dn.24xlargeGPU,p3dn.24xlargeGPU,FALSE
+p3dn.24xlargeGPU,p3dn.24xlargeGPU,FALSE
+r5n.large,r5n.large,FALSE
+r5n.xlarge,r5n.xlarge,FALSE
+r5n.2xlarge,r5n.2xlarge,FALSE
+r5n.4xlarge,r5n.4xlarge,FALSE
+r5n.8xlarge,r5n.8xlarge,FALSE
+r5n.12xlarge,r5n.12xlarge,FALSE
diff --git a/data/notebooks/DBFS File Export.py b/data/notebooks/DBFS File Export.py
new file mode 100644
index 0000000..b5295ec
--- /dev/null
+++ b/data/notebooks/DBFS File Export.py
@@ -0,0 +1,48 @@
+# Databricks notebook source
+dbutils.widgets.text("bucket","dbfs:/mnt/....","1: S3 Intermediary Bucket")
+dbutils.widgets.text("dbfs","dbfs:/","2: DBFS Directory")
+
+# COMMAND ----------
+
+from py4j.java_gateway import java_import
+java_import(sc._gateway.jvm, "org.apache.hadoop.fs.*")
+
+bucket_dest_dir = dbutils.widgets.get("bucket")
+dbfs_source_dir = dbutils.widgets.get("dbfs")
+
+print(f"Getting list of files in the source directory {dbfs_source_dir}...")
+
+# Get list of files in the source directory
+skip_paths = ["dbfs:/mnt/", "dbfs:/databricks/", "dbfs:/databricks-datasets/","dbfs:/databricks-results/"]
+files = [f for f in dbutils.fs.ls(dbfs_source_dir) if f.path not in skip_paths]
+print(f"Found {len(files)} files in the source directory.")
+
+# COMMAND ----------
+
+# hadoop_conf = sc._jsc.hadoopConfiguration(): This line is getting the Hadoop configuration from the Java Spark Context. This configuration contains settings for Hadoop and can be used to interact with the Hadoop file system.
+# hadoop_fs = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem: This line is accessing the Hadoop FileSystem class via PySpark's JVM gateway. The FileSystem class is a generic class in Hadoop that handles file systems.
+# hadoop_path = sc._gateway.jvm.org.apache.hadoop.fs.Path: This line is accessing the Hadoop Path class via PySpark's JVM gateway. The Path class represents file and directory paths in a Hadoop file system.
+
+hadoop_conf = sc._jsc.hadoopConfiguration()
+hadoop_fs = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
+hadoop_path = sc._gateway.jvm.org.apache.hadoop.fs.Path
+
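+# Hedged helper (illustrative only; the copy loop below does not call it): the same JVM
+# handles described above can be used to check whether a path exists before attempting a copy.
+def path_exists(path_str):
+    fs = hadoop_fs.get(hadoop_path(path_str).toUri(), hadoop_conf)
+    return fs.exists(hadoop_path(path_str))
+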
+def copy_file(file):
+ from_path = hadoop_path(file)
+ to_path = hadoop_path(bucket_dest_dir)
+ from_fs = hadoop_fs.get(from_path.toUri(), hadoop_conf)
+ to_fs = hadoop_fs.get(to_path.toUri(), hadoop_conf)
+ print(f"Moving {from_path} to {to_path}")
+ sc._gateway.jvm.org.apache.hadoop.fs.FileUtil.copy(from_fs, from_path, to_fs, to_path, False, hadoop_conf)
+
+
+# Copy each file to the destination directory
+for file in files:
+ file_name = file.path
+ copy_file(file_name)
+
+print("All files copied to the bucket successfully!")
+
+# COMMAND ----------
+
+
diff --git a/data/notebooks/DBFS File Import b/data/notebooks/DBFS File Import
new file mode 100644
index 0000000..84adc05
--- /dev/null
+++ b/data/notebooks/DBFS File Import
@@ -0,0 +1,44 @@
+# Databricks notebook source
+dbutils.widgets.text("bucket","dbfs:/mnt/....","1: S3 Intermediary Bucket")
+dbutils.widgets.text("dbfs","dbfs:/","2: DBFS Directory")
+
+# COMMAND ----------
+
+from py4j.java_gateway import java_import
+java_import(sc._gateway.jvm, "org.apache.hadoop.fs.*")
+
+bucket_source_dir = dbutils.widgets.get("bucket")
+dbfs_dest_dir = dbutils.widgets.get("dbfs")
+
+print(f"Getting list of files in the source directory {bucket_source_dir}...")
+
+# Get list of files in the source directory
+files = dbutils.fs.ls(bucket_source_dir)
+print(f"Found {len(files)} in source directory.")
+
+# COMMAND ----------
+
+# hadoop_conf = sc._jsc.hadoopConfiguration(): This line is getting the Hadoop configuration from the Java Spark Context. This configuration contains settings for Hadoop and can be used to interact with the Hadoop file system.
+# hadoop_fs = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem: This line is accessing the Hadoop FileSystem class via PySpark's JVM gateway. The FileSystem class is a generic class in Hadoop that handles file systems.
+# hadoop_path = sc._gateway.jvm.org.apache.hadoop.fs.Path: This line is accessing the Hadoop Path class via PySpark's JVM gateway. The Path class represents file and directory paths in a Hadoop file system.
+
+hadoop_conf = sc._jsc.hadoopConfiguration()
+hadoop_fs = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
+hadoop_path = sc._gateway.jvm.org.apache.hadoop.fs.Path
+
+def copy_file(file):
+ from_path = hadoop_path(file)
+ to_path = hadoop_path(dbfs_dest_dir)
+ from_fs = hadoop_fs.get(from_path.toUri(), hadoop_conf)
+ to_fs = hadoop_fs.get(to_path.toUri(), hadoop_conf)
+ print(f"Moving {from_path} to {to_path}")
+ sc._gateway.jvm.org.apache.hadoop.fs.FileUtil.copy(from_fs, from_path, to_fs, to_path, False, hadoop_conf)
+
+
+# Copy each file to the destination directory
+for file in files:
+ file_name = file.path
+ copy_file(file_name)
+
+print("All files copied to the bucket successfully!")
+# COMMAND ----------
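+
+# Hedged sketch (left commented out on purpose): for large file counts the sequential loop
+# above can often be parallelised with a thread pool, since FileUtil.copy does its I/O in the JVM.
+# from concurrent.futures import ThreadPoolExecutor
+# with ThreadPoolExecutor(max_workers=8) as pool:
+#     list(pool.map(copy_file, [f.path for f in files]))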
diff --git a/data/notebooks/DBFS Sizing Notebooks.py b/data/notebooks/DBFS Sizing Notebooks.py
new file mode 100644
index 0000000..6620541
--- /dev/null
+++ b/data/notebooks/DBFS Sizing Notebooks.py
@@ -0,0 +1,44 @@
+# Databricks notebook source
+def recursiveDirSize(path):
+ total = 0
+ dir_files = dbutils.fs.ls(path)
+ for file in dir_files:
+ if file.isDir():
+ total += recursiveDirSize(file.path)
+ else:
+ total += file.size
+ return total
+
+
+# COMMAND ----------
+
+dbfs_paths = dbutils.fs.ls("dbfs:/")
+
+paths = []
+sizes = []
+
+skip_paths = ["dbfs:/mnt/", "dbfs:/databricks/", "dbfs:/databricks-datasets/","dbfs:/databricks-results/"]
+
+for p in dbfs_paths:
+ try:
+ print("Working on", p.path)
+ if p.path in skip_paths:
+ continue
+ p_size = recursiveDirSize(p.path)
+ paths.append(p.path)
+ sizes.append(p_size)
+ print("Completed", p.path)
+    except Exception:
+        print(f"Could not find size for path {p.path}")
+
+# COMMAND ----------
+
+spark.createDataFrame([(i, j/1e6) for i, j in zip(paths, sizes)], schema = ["Path", "Size in MB"]).display()
+
+# COMMAND ----------
+
+spark.createDataFrame([(i, j/1e9) for i, j in zip(paths, sizes)], schema = ["Path", "Size in GB"]).display()
+
+# COMMAND ----------
+
+dbutils.fs.ls("dbfs:/")
diff --git a/data/notebooks/Metastore_Scout.py b/data/notebooks/Metastore_Scout.py
new file mode 100644
index 0000000..218daea
--- /dev/null
+++ b/data/notebooks/Metastore_Scout.py
@@ -0,0 +1,112 @@
+# Databricks notebook source
+!pip install tqdm
+
+# COMMAND ----------
+
+from pyspark.sql.functions import *
+from pyspark.sql.types import StructType, StructField, StringType
+from tqdm import tqdm
+
+# COMMAND ----------
+
+dbutils.widgets.text("database_list", "all")
+database_list = dbutils.widgets.get("database_list").split(",")
+
+dbutils.widgets.text("get_ddl", "false")
+getDDL = dbutils.widgets.get("get_ddl") == "true"
+
+dbutils.widgets.text("calculate_size", "false")
+calculateSize = dbutils.widgets.get("calculate_size") == "true"
+
+# COMMAND ----------
+
+def getAllDatabases():
+ databaseList = spark.sql(f"""SHOW DATABASES""").select("databaseName").rdd.flatMap(lambda x:x).collect()
+ return databaseList
+
+def getAllTables(database):
+ tableList = spark.sql(f"""SHOW TABLES IN {database}""").select("tableName").rdd.flatMap(lambda x:x).collect()
+ databaseAndTableList = [f"{database}.{t}" for t in tableList]
+ return databaseAndTableList
+
+def getTableDetail(table, detail):
+ try:
+ tableDetail = spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == '{detail}'").select("data_type").rdd.flatMap(lambda x:x).collect()[0]
+ except Exception as e:
+ tableDetail = "N/A"
+ return tableDetail
+
+def getTableSize(table, calculateSize):
+ if calculateSize:
+ spark.sql(f"ANALYZE TABLE {table} COMPUTE STATISTICS NOSCAN")
+ try:
+ tableSize = (spark.sql(f"DESCRIBE DETAIL {table}").collect()[0]['sizeInBytes'])
+ if (tableSize == None):
+ tableSize = int(spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == 'Statistics'").select("data_type").rdd.flatMap(lambda x:x).collect()[0].split(' ')[0])
+ except Exception as e:
+ tableSize = -1
+ else:
+ tableSize = -1
+ return tableSize
+
+def getTableDDL(table, getDDL):
+ if getDDL:
+ tableDDL = spark.sql(f"""SHOW CREATE TABLE {table}""").collect()[0][0]
+ else:
+ tableDDL = "N/A"
+ return tableDDL
+
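+# Hedged convenience helper (not called by main_scout): spot-check a single table before
+# running a full scan. The table name is whatever the caller supplies.
+def describeOneTable(table, calcSize=False, ddl=False):
+    return {
+        "type": getTableDetail(table, "Type"),
+        "location": getTableDetail(table, "Location"),
+        "provider": getTableDetail(table, "Provider"),
+        "createdBy": getTableDetail(table, "Created By"),
+        "sizeInBytes": getTableSize(table, calcSize),
+        "ddl": getTableDDL(table, ddl),
+    }
+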
+# COMMAND ----------
+
+def main_scout(database_list):
+
+ if database_list == ['all']:
+ database_list = getAllDatabases()
+
+ print(f"Analyzing {len(database_list)} databases.")
+ fullTableList = []
+
+ for database in database_list:
+ tableList = getAllTables(database)
+ print(f"{database}: {len(tableList)}")
+ fullTableList.extend(tableList)
+
+ print(f"Found {len(fullTableList)} in {len(database_list)} databases.")
+
+ fullTableDetails = []
+ failedTables = []
+
+ for table in tqdm(fullTableList):
+ try:
+ tableType = getTableDetail(table, "Type")
+ tableLocation = getTableDetail(table, "Location")
+ tableProvider = getTableDetail(table, "Provider")
+ tableVersion = getTableDetail(table, "Created By")
+ tableSize = getTableSize(table, calculateSize)
+ tableDDL = getTableDDL(table, getDDL)
+ fullTableDetails.append((table, tableType, tableLocation, tableProvider, tableVersion, tableSize, tableDDL))
+ except Exception as e:
+ failedTables.append((table, str(e)))
+ continue
+
+ columns = ["tableName", "tableType", "tableLocation", "tableProvider", "tableVersion", "tableSize", "tableDDL"]
+ spark.createDataFrame(data=fullTableDetails, schema = columns).write.mode("overwrite").saveAsTable("e2_migration_testing_to_delete.metastore_scan")
+
+ failedTableSchema = StructType([
+ StructField("table", StringType(),True),
+ StructField("error", StringType(),True)
+ ])
+
+ spark.createDataFrame(data = failedTables, schema = failedTableSchema).write.mode("overwrite").saveAsTable("e2_migration_testing_to_delete.metastore_scan_errors")
+
+# COMMAND ----------
+
+# MAGIC %sql
+# MAGIC CREATE DATABASE IF NOT EXISTS e2_migration_testing_to_delete
+
+# COMMAND ----------
+
+main_scout(database_list)
+
+# COMMAND ----------
+
+
diff --git a/data/notebooks/Metastore_Scout_no_views.py b/data/notebooks/Metastore_Scout_no_views.py
new file mode 100644
index 0000000..a8ec31d
--- /dev/null
+++ b/data/notebooks/Metastore_Scout_no_views.py
@@ -0,0 +1,97 @@
+# Databricks notebook source
+!pip install tqdm
+
+# COMMAND ----------
+
+from pyspark.sql.functions import *
+from tqdm import tqdm
+
+# COMMAND ----------
+
+dbutils.widgets.text("database_list", "")
+database_list = dbutils.widgets.get("database_list").split(",")
+
+# COMMAND ----------
+
+def getAllDatabases():
+ databaseList = spark.sql(f"""SHOW DATABASES""").select("databaseName").rdd.flatMap(lambda x:x).collect()
+ return databaseList
+
+def getAllTables(database):
+ tableList = spark.sql(f"""SHOW TABLES IN {database}""").select("tableName").rdd.flatMap(lambda x:x).collect()
+    views_list = spark.sql(f"SHOW VIEWS FROM {database}").select("viewName").rdd.flatMap(lambda x: x).collect()
+    tables_only_list = [x for x in tableList if x not in views_list]
+ databaseAndTableList = [f"{database}.{t}" for t in tables_only_list]
+ return databaseAndTableList
+
+def getTableDetail(table, detail):
+ try:
+ tableDetail = spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == '{detail}'").select("data_type").rdd.flatMap(lambda x:x).collect()[0]
+ except Exception as e:
+ tableDetail = "N/A"
+ return tableDetail
+
+def getTableSize(table):
+ spark.sql(f"ANALYZE TABLE {table} COMPUTE STATISTICS NOSCAN")
+ try:
+ tableSize = (spark.sql(f"DESCRIBE DETAIL {table}").collect()[0]['sizeInBytes'])
+ if (tableSize == None):
+ tableSize = int(spark.sql(f"""DESC EXTENDED {table}""").filter(f"col_name == 'Statistics'").select("data_type").rdd.flatMap(lambda x:x).collect()[0].split(' ')[0])
+ except Exception as e:
+ tableSize = -1
+ return tableSize
+
+def getTableDDL(table):
+ tableDDL = spark.sql(f"""SHOW CREATE TABLE {table}""").collect()[0][0]
+ return tableDDL
+
+# COMMAND ----------
+
+def main_scout(database_list):
+
+ if database_list == ['all']:
+ database_list = getAllDatabases()
+
+ print(f"Analyzing {len(database_list)} databases.")
+ fullTableList = []
+
+ for database in database_list:
+ tableList = getAllTables(database)
+ print(f"{database}: {len(tableList)}")
+ fullTableList.extend(tableList)
+
+ print(f"Found {len(fullTableList)} in {len(database_list)} databases.")
+
+ fullTableDetails = []
+ failedTables = []
+
+ for table in tqdm(fullTableList):
+ try:
+ tableType = getTableDetail(table, "Type")
+ tableLocation = getTableDetail(table, "Location")
+ tableProvider = getTableDetail(table, "Provider")
+ tableVersion = getTableDetail(table, "Created By")
+ tableSize = getTableSize(table)
+ tableDDL = getTableDDL(table)
+ fullTableDetails.append((table, tableType, tableLocation, tableProvider, tableVersion, tableSize, tableDDL))
+ except Exception as e:
+ failedTables.append((table, str(e)))
+ continue
+
+ columns = ["tableName", "tableType", "tableLocation", "tableProvider", "tableVersion", "tableSize", "tableDDL"]
+ spark.createDataFrame(data=fullTableDetails, schema = columns).write.mode("overwrite").saveAsTable("e2_migration_testing_to_delete.metastore_scan")
+
+ spark.createDataFrame(data = failedTables, schema = ['table', 'error']).write.mode("overwrite").saveAsTable("e2_migration_testing_to_delete.metastore_scan_errors")
+
+# COMMAND ----------
+
+# MAGIC %sql
+# MAGIC CREATE DATABASE IF NOT EXISTS e2_migration_testing_to_delete
+
+# COMMAND ----------
+
+main_scout(database_list)
+
+# COMMAND ----------
+
+
diff --git a/data/notebooks/empty_directory_creator.py b/data/notebooks/empty_directory_creator.py
new file mode 100644
index 0000000..351a150
--- /dev/null
+++ b/data/notebooks/empty_directory_creator.py
@@ -0,0 +1,126 @@
+import argparse
+from datetime import timedelta
+import json
+import time
+import requests
+import pandas as pd
+
+def _get_workspace_list(STURL, STTOKEN, path="/"):
+ print(f"Directories under {path}...")
+ requestsURL = STURL + "/api/2.0/workspace/list?path="
+ requestsURL += path
+ headers = {
+ 'Authorization': f'Bearer {STTOKEN}'
+ }
+ payload = {}
+ print(requestsURL)
+ response = requests.request("GET", requestsURL, headers=headers, data=payload)
+ if response.status_code == 200:
+ try:
+ pathsFound = response.json()['objects']
+            dirsFound = [obj["path"] for obj in pathsFound if obj.get("object_type") == "DIRECTORY"]
+ print(f"Found: {len(dirsFound)} directories")
+ return dirsFound, "Not empty"
+ except KeyError:
+ print(f"Appears that {path} is empty... Logging.")
+ return [], "Empty"
+ else:
+ print(response.text)
+ return "Failed", "Failed"
+
+def _make_E2_empty_directory(E2URL, E2TOKEN, path):
+ print(f"Making an empty directory at {path} in E2...")
+ requestsURL = E2URL + "/api/2.0/workspace/mkdirs"
+ headers = {
+ 'Authorization': f'Bearer {E2TOKEN}'
+ }
+ payload = {"path": path}
+ print(requestsURL, payload)
+ response = requests.request("POST", requestsURL, headers=headers, data=payload)
+
+ if response.status_code == 200:
+ print(f"Successfully created empty directory at {path} in E2...")
+ return "Success"
+ else:
+ print(response.text)
+ return "Failed"
+
+
+def _run_test_if_empty(ST, STTOKEN, E2, E2TOKEN, pathsToCheck, pathsChecked, pathsStatus, pathsCreated, pathsCreatedStatus):
+ next_level_dirs = []
+
+ for newPath in pathsToCheck:
+ newDirs, status = _get_workspace_list(ST, STTOKEN, newPath)
+ pathsChecked.append(newPath)
+ pathsStatus.append(status)
+ next_level_dirs.extend(newDirs)
+
+ if status == "Empty":
+ result = _make_E2_empty_directory(E2, E2TOKEN, newPath)
+ pathsCreated.append(newPath)
+ pathsCreatedStatus.append(result)
+
+ if len(next_level_dirs) == 0:
+ test_status = "Done"
+ else:
+ test_status = "Again"
+
+ return pathsChecked, pathsStatus, pathsCreated, pathsCreatedStatus, next_level_dirs, test_status
+
+def main(E2, E2TOKEN, ST, STTOKEN, PATH="/"):
+ print("Starting empty workspace creation...")
+ start = time.time()
+
+ if PATH is None:
+ PATH = "/"
+
+ pathsChecked = []
+ pathsStatus = []
+ pathsCreated = []
+ pathsCreatedStatus = []
+
+ dirs, status = _get_workspace_list(ST, STTOKEN, PATH)
+ pathsChecked.append(PATH)
+ pathsStatus.append(status)
+
+ while True:
+ pathsChecked, pathsStatus, pathsCreated, pathsCreatedStatus, dirs, test_status = _run_test_if_empty(ST, STTOKEN, E2, E2TOKEN, dirs, pathsChecked, pathsStatus, pathsCreated, pathsCreatedStatus)
+
+ if test_status == "Done":
+ print("Should end now...")
+ break
+
+ modelDict = {
+ 'paths': pathsChecked,
+ 'empty_or_not': pathsStatus,
+ }
+
+ print("Logging the paths checked...")
+ df = pd.DataFrame.from_dict(modelDict)
+ df.to_csv("paths_checked.csv")
+ print("Saved paths checked to paths_checked.csv")
+
+ modelDict = {
+ 'paths': pathsCreated,
+ 'empty_or_not': pathsCreatedStatus,
+ }
+
+ print("Logging the paths created...")
+ df = pd.DataFrame.from_dict(modelDict)
+ df.to_csv("paths_created.csv")
+ print("Saved paths created to paths_created.csv")
+
+ end = time.time()
+ print("...Finished")
+ execution_time = end - start
+ print(f"Time script took: {timedelta(seconds=execution_time)}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Create empty workspace directories from the ST workspace in the E2 workspace.")
+ parser.add_argument("--E2workspace", "--E2", dest="E2", help="URL to the E2 workspace")
+ parser.add_argument("--E2token", dest="E2TOKEN", help="E2 token for access.")
+ parser.add_argument("--STworkspace", "--ST", dest="ST", help="URL to the ST workspace")
+ parser.add_argument("--STtoken", dest="STTOKEN", help="ST token for access.")
+ parser.add_argument("--PATH", dest="PATH", help="Starting path, defaults to '/'. Will work recursively from there.")
+ parser = parser.parse_args()
+    main(parser.E2, parser.E2TOKEN, parser.ST, parser.STTOKEN, parser.PATH)
\ No newline at end of file
diff --git a/dbclient/ClustersClient.py b/dbclient/ClustersClient.py
index 5619ec2..0b0ac89 100644
--- a/dbclient/ClustersClient.py
+++ b/dbclient/ClustersClient.py
@@ -1,10 +1,12 @@
import logging
import os
+import csv
import re
import time
import logging_utils
import wmconstants
from dbclient import *
+from .ScimClient import ScimClient
class ClustersClient(dbclient):
@@ -12,6 +14,7 @@ def __init__(self, configs, checkpoint_service):
super().__init__(configs)
self._checkpoint_service = checkpoint_service
self.groups_to_keep = configs.get("groups_to_keep", False)
+ self.scim_client = ScimClient(configs, checkpoint_service)
self.skip_missing_users = configs['skip_missing_users']
self.hipaa = configs.get('hipaa', False)
self.bypass_secret_acl = configs.get('bypass_secret_acl', False)
@@ -98,7 +101,7 @@ def cleanup_cluster_pool_configs(self, cluster_json, cluster_creator, is_job_clu
if 'aws_attributes' in cluster_json:
aws_conf = cluster_json.pop('aws_attributes')
iam_role = aws_conf.get('instance_profile_arn', None)
- if not iam_role:
+ if iam_role:
cluster_json['aws_attributes'] = {'instance_profile_arn': iam_role}
return cluster_json
@@ -257,7 +260,19 @@ def get_new_policy_id_dict(self, policy_file='cluster_policies.log'):
policy_id_dict[old_policy_id] = current_policies_dict[policy_name] # old_id : new_id
return policy_id_dict
- def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None):
+ def nitro_instance_mapping(self, instance_type_id):
+ dict_from_csv = {}
+ real_path = os.path.dirname(os.path.realpath(__file__))
+ csv_file = f'{real_path}/../data/nitro_mapping.csv'
+ with open(csv_file, newline='', mode='r') as f:
+ reader = csv.DictReader(f)
+ for row in reader:
+ dict_from_csv[row['PVC Instance Type']] = row['Recommended Nitro Instance Type']
+
+ nitro_instance_type_id = dict_from_csv[instance_type_id]
+ return nitro_instance_type_id
+
+ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clusters.log', filter_user=None, nitro=False):
"""
Import cluster configs and update appropriate properties / tags in the new env
:param log_file:
@@ -301,6 +316,12 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus
else:
cluster_conf['custom_tags'] = {'OriginalCreator': cluster_creator}
new_cluster_conf = cluster_conf
+ if nitro:
+ if 'node_type_id' in new_cluster_conf:
+ new_cluster_conf['node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['node_type_id'])
+ if 'driver_node_type_id' in new_cluster_conf:
+ new_cluster_conf['driver_node_type_id'] = self.nitro_instance_mapping(new_cluster_conf['driver_node_type_id'])
+
print("Creating cluster: {0}".format(new_cluster_conf['cluster_name']))
cluster_resp = self.post('/clusters/create', new_cluster_conf)
if cluster_resp['http_status_code'] == 200:
@@ -310,6 +331,9 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus
if 'cluster_id' in cluster_conf:
checkpoint_cluster_configs_set.write(cluster_conf['cluster_id'])
else:
+ cluster_resp['old_cluster_id'] = cluster_conf['cluster_id']
+ cluster_resp['old_cluster_name'] = cluster_conf['cluster_name']
+
logging_utils.log_response_error(error_logger, cluster_resp)
print(cluster_resp)
@@ -338,14 +362,38 @@ def import_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_clus
ignore_error_list = ["RESOURCE_DOES_NOT_EXIST", "RESOURCE_ALREADY_EXISTS"]
else:
ignore_error_list = ["RESOURCE_ALREADY_EXISTS"]
-
+
if logging_utils.check_error(resp, ignore_error_list):
+ if resp['error_code'] == 'RESOURCE_DOES_NOT_EXIST':
+ resp = self.remove_missing_users(api, acl_args, resp)
+ if not logging_utils.log_response_error(error_logger, resp):
+ if 'object_id' in data:
+ checkpoint_cluster_configs_set.write(data['object_id'])
+ else:
+ logging_utils.log_response_error(error_logger, resp)
logging_utils.log_response_error(error_logger, resp)
elif 'object_id' in data:
checkpoint_cluster_configs_set.write(data['object_id'])
print(resp)
+ def remove_missing_users(self, api, acl_args, resp):
+ # example message: 'Principal: UserName(x.x@email.com) does not exist'
+ # or 'Principal: GroupName(x.x) does not exist'
+ resp = self.put(api, acl_args)
+ while resp.get('error_code', '') == 'RESOURCE_DOES_NOT_EXIST':
+ if 'UserName' in resp['message']:
+ missing_user = re.search(r'Principal: UserName\((.*)\) does not exist', resp['message']).group(1)
+ logging.info(f"Removing missing user {missing_user} from ACL")
+ acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl.get('user_name', None) != missing_user]
+ resp = self.put(api, acl_args)
+ elif 'GroupName' in resp['message']:
+ missing_group = re.search(r'Principal: GroupName\((.*)\) does not exist', resp['message']).group(1)
+ logging.info(f"Removing missing group {missing_group} from ACL")
+ acl_args['access_control_list'] = [acl for acl in acl_args['access_control_list'] if acl.get('group_name', None) != missing_group]
+ resp = self.put(api, acl_args)
+ return resp
+
def _log_cluster_ids_and_original_creators(
self,
cluster_log_file,
@@ -570,8 +618,9 @@ def log_cluster_configs(self, log_file='clusters.log', acl_log_file='acl_cluster
# get users list based on groups_to_keep
users_list = []
- if self.groups_to_keep is not None:
- all_users = self.get('/preview/scim/v2/Users').get('Resources', None)
+ if self.groups_to_keep is not False:
+ # all_users = self.get('/preview/scim/v2/Users').get('Resources', None)
+ all_users = self.scim_client.get_active_users()
users_list = list(set([user.get("emails")[0].get("value") for user in all_users
for group in user.get("groups") if group.get("display") in self.groups_to_keep]))
@@ -646,8 +695,9 @@ def log_cluster_policies(self, log_file='cluster_policies.log', acl_log_file='ac
# get users list based on groups_to_keep
users_list = []
- if self.groups_to_keep is not None:
- all_users = self.get('/preview/scim/v2/Users').get('Resources', None)
+ if self.groups_to_keep is not False:
+ # all_users = self.get('/preview/scim/v2/Users').get('Resources', None)
+ all_users = self.scim_client.get_active_users()
users_list = list(set([user.get("emails")[0].get("value") for user in all_users
for group in user.get("groups") if
group.get("display") in self.groups_to_keep]))
diff --git a/dbclient/HiveClient.py b/dbclient/HiveClient.py
index 3980b7f..ba5d582 100644
--- a/dbclient/HiveClient.py
+++ b/dbclient/HiveClient.py
@@ -274,7 +274,7 @@ def export_database(self, db_name, cluster_name=None, iam_role=None, metastore_d
success_metastore_log_path, current_iam, checkpoint_metastore_set, has_unicode)
def export_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', db_log='database_details.log',
- success_log='success_metastore.log', has_unicode=False):
+ success_log='success_metastore.log', has_unicode=False, database=None):
start = timer()
checkpoint_metastore_set = self._checkpoint_service.get_checkpoint_key_set(
wmconstants.WM_EXPORT, wmconstants.METASTORE_TABLES)
@@ -300,7 +300,10 @@ def export_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', d
database_logfile = self.get_export_dir() + db_log
if os.path.exists(success_metastore_log_path):
os.remove(success_metastore_log_path)
- all_dbs = self.get_all_databases(error_logger, cid, ec_id)
+ if database:
+ all_dbs = database
+ else:
+ all_dbs = self.get_all_databases(error_logger, cid, ec_id)
resp = self.set_desc_database_helper(cid, ec_id)
if self.is_verbose():
logging.info(resp)
@@ -407,6 +410,7 @@ def import_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', v
if not self.move_table_view(db_name, tbl_name, local_table_ddl):
# we hit a table ddl here, so we apply the ddl
resp = self.apply_table_ddl(local_table_ddl, ec_id, cid, db_path, has_unicode)
+ resp['table'] = db_name + "." + tbl_name
if not logging_utils.log_response_error(error_logger, resp):
checkpoint_metastore_set.write(full_table_name)
else:
@@ -439,6 +443,7 @@ def import_hive_metastore(self, cluster_name=None, metastore_dir='metastore/', v
db_name, view_name = unpack_view_db_name(full_view_name)
local_view_ddl = metastore_view_dir + db_name + '/' + view_name
resp = self.apply_table_ddl(local_view_ddl, ec_id, cid, db_path, has_unicode)
+ resp['view'] = full_view_name
if not logging_utils.log_response_error(error_logger, resp):
checkpoint_metastore_set.write(full_view_name)
logging.info(resp)
diff --git a/dbclient/JobsClient.py b/dbclient/JobsClient.py
index 0d4dace..41effcb 100644
--- a/dbclient/JobsClient.py
+++ b/dbclient/JobsClient.py
@@ -1,4 +1,5 @@
import json
+import csv
import os
import logging
import logging_utils
@@ -69,7 +70,7 @@ def update_imported_job_names(self, error_logger, checkpoint_job_configs_set):
job_name = job['settings']['name']
# job name was set to `old_job_name:::{job_id}` to support duplicate job names
# we need to parse the old job name and update the current jobs
- if checkpoint_job_configs_set.contains(job_name):
+ if checkpoint_job_configs_set.contains(job_name) or (':::' not in job_name):
continue
old_job_name = job_name.split(':::')[0]
new_settings = {'name': old_job_name}
@@ -81,7 +82,7 @@ def update_imported_job_names(self, error_logger, checkpoint_job_configs_set):
else:
raise RuntimeError("Import job has failed. Refer to the previous log messages to investigate.")
- def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.log', acl_file='acl_jobs.log'):
+ def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.log', acl_file='acl_jobs.log', default_job_owner=False):
"""
log all job configs and the ACLs for each job
:param users_list: a list of users / emails to filter the results upon (optional for group exports)
@@ -133,6 +134,15 @@ def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.lo
for permission in acl.get("all_permissions"):
if permission.get("permission_level") == "IS_OWNER":
valid_acl = True
+ if not valid_acl and default_job_owner:
+ default_owner_permission = {"user_name": default_job_owner, "all_permissions": [{"permission_level": "IS_OWNER", "inherited": False}]}
+ acls.append(default_owner_permission)
+ # re check if ACL is valid
+ for acl in acls:
+ for permission in acl.get("all_permissions"):
+ if permission.get("permission_level") == "IS_OWNER":
+ valid_acl = True
+
if valid_acl:
# job and job_acl are fine, writing both to the output files
log_fp.write(json.dumps(x) + '\n')
@@ -150,7 +160,19 @@ def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.lo
'error': message, 'json': json.dumps(x)
})
- def import_job_configs(self, log_file='jobs.log', acl_file='acl_jobs.log', job_map_file='job_id_map.log'):
+ def nitro_instance_mapping(self, instance_type_id):
+ dict_from_csv = {}
+ real_path = os.path.dirname(os.path.realpath(__file__))
+ csv_file = f'{real_path}/../data/nitro_mapping.csv'
+ with open(csv_file, newline='', mode='r') as f:
+ reader = csv.DictReader(f)
+ for row in reader:
+ dict_from_csv[row['PVC Instance Type']] = row['Recommended Nitro Instance Type']
+
+ nitro_instance_id = dict_from_csv[instance_type_id]
+ return nitro_instance_id
+
+ def import_job_configs(self, log_file='jobs.log', acl_file='acl_jobs.log', job_map_file='job_id_map.log', nitro=False):
jobs_log = self.get_export_dir() + log_file
acl_jobs_log = self.get_export_dir() + acl_file
job_map_log = self.get_export_dir() + job_map_file
@@ -233,7 +255,13 @@ def adjust_ids_for_cluster(settings): #job_settings or task_settings
mod_task_settings.append(adjust_ids_for_cluster(task_settings))
if len(mod_task_settings) > 0:
job_settings['tasks'] = mod_task_settings
-
+ if nitro:
+ if 'new_cluster' in job_settings:
+ if 'node_type_id' in job_settings['new_cluster']:
+ job_settings['new_cluster']['node_type_id'] = self.nitro_instance_mapping(job_settings['new_cluster']['node_type_id'])
+ if 'driver_node_type_id' in job_settings['new_cluster']:
+ job_settings['new_cluster']['driver_node_type_id'] = self.nitro_instance_mapping(job_settings['new_cluster']['driver_node_type_id'])
+ logging.info(job_settings)
logging.info("Current Job Name: {0}".format(job_conf['settings']['name']))
# creator can be none if the user is no longer in the org. see our docs page
create_resp = self.post('/jobs/create', job_settings)
diff --git a/dbclient/ScimClient.py b/dbclient/ScimClient.py
index 96f856e..6277020 100644
--- a/dbclient/ScimClient.py
+++ b/dbclient/ScimClient.py
@@ -7,40 +7,70 @@
import concurrent
from concurrent.futures import ThreadPoolExecutor
from threading_utils import propagate_exceptions
+import concurrent.futures
class ScimClient(dbclient):
def __init__(self, configs, checkpoint_service):
super().__init__(configs)
self._checkpoint_service = checkpoint_service
self.groups_to_keep = configs.get("groups_to_keep", False)
-
- def get_active_users(self):
- users = self.get('/preview/scim/v2/Users').get('Resources', None)
- return users if users else None
+ self.users_list = self.get_users_full_from_log()
+
+
+ def fetch_page(self, start, count):
+ endpoint = f'/preview/scim/v2/Users?startIndex={start}&count={count}'
+ response = self.get(endpoint)
+ return response.get('Resources', [])
+
+ def get_active_users(self, results=None):
+
+ if self._use_logs and self.users_list is None:
+ results = self.get_users_full_from_log()
+ elif self._use_logs:
+ results = self.users_list
+
+ if results is None:
+ page_size = 10
+ first_response = self.get(f'/preview/scim/v2/Users?startIndex=1&count=1')
+ total = first_response.get('totalResults', 0)
+ if total == 0:
+ return None
+
+ indices = range(1, total + 1, page_size)
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+ futures = [executor.submit(self.fetch_page, i, page_size) for i in indices]
+ results = []
+ for future in concurrent.futures.as_completed(futures):
+ results.extend(future.result())
+
+ return results or None
def log_all_users(self, log_file='users.log'):
user_log = self.get_export_dir() + log_file
- users = self.get('/preview/scim/v2/Users').get('Resources', None)
- if users:
+
+ all_users = self.get_active_users()
+
+ if all_users:
with open(user_log, "w", encoding="utf-8") as fp:
- for x in users:
+ for x in all_users:
fullname = x.get('name', None)
- # if a group list has been passed, check to see if current user is part of groups
if self.groups_to_keep:
- user_groups = [g['display'] for g in x.get('groups')]
+ user_groups = [g['display'] for g in x.get('groups', [])]
if not set(user_groups).intersection(set(self.groups_to_keep)):
continue
if fullname:
given_name = fullname.get('givenName', None)
- # if user is an admin, skip this user entry
if x['userName'] == 'admin' and given_name == 'Administrator':
continue
+
fp.write(json.dumps(x) + '\n')
else:
logging.info("Users returned an empty object")
+
def log_single_user(self, user_email, log_file='single_user.log'):
single_user_log = self.get_export_dir() + log_file
users = self.get_active_users()
@@ -69,12 +99,38 @@ def get_users_from_log(self, users_log='users.log'):
:return: a list of usernames that help identify their workspace paths
"""
user_logfile = self.get_export_dir() + users_log
+
username_list = []
- with open(user_logfile, 'r', encoding="utf-8") as fp:
- for u in fp:
+ if self.users_list is None:
+ with open(user_logfile, 'r', encoding="utf-8") as fp:
+ for u in fp:
+ user_json = json.loads(u)
+ username_list.append(user_json.get('userName'))
+ else:
+            for u in map(json.dumps, self.users_list):  # cached entries are dicts; re-serialize so json.loads below still applies
user_json = json.loads(u)
- username_list.append(user_json.get('userName'))
+ username_list.append(user_json.get('userName'))
+
return username_list
+
+
+ def get_users_full_from_log(self, users_log='users.log'):
+ """
+        fetch the full user records (JSON) from the users log file
+        meant to be used when export logs already exist so the SCIM API does not have to be re-queried
+        :param users_log:
+        :return: a list of user JSON objects, or None if the log file does not exist
+ """
+ user_logfile = self.get_export_dir() + users_log
+ if os.path.isfile(user_logfile):
+ username_list = []
+ with open(user_logfile, 'r', encoding="utf-8") as fp:
+ for u in fp:
+ user_json = json.loads(u)
+ username_list.append(user_json)
+ return username_list
+ else:
+ return None
@staticmethod
def is_member_a_user(member_json):
@@ -98,10 +154,29 @@ def add_username_to_group(self, group_json):
# add the userName field to json since ids across environments may not match
members = group_json.get('members', [])
new_members = []
+        # prefer the users list cached on the client; fall back to re-reading the users log
+        users_list = self.get_users_full_from_log() if self.users_list is None else self.users_list
+
for m in members:
m_id = m['value']
if self.is_member_a_user(m):
- user_resp = self.get('/preview/scim/v2/Users/{0}'.format(m_id))
+ user_resp = None
+ if users_list:
+                    # look the member up in the cached users list by id
+                    for u in users_list:
+                        if str(u['id']) == str(m_id):
+                            user_resp = u
+                            break
+
+ if user_resp is None:
+ user_resp = self.get('/preview/scim/v2/Users/{0}'.format(m_id))
+ else:
+ user_resp = self.get('/preview/scim/v2/Users/{0}'.format(m_id))
m['userName'] = user_resp['userName']
m['type'] = 'user'
elif self.is_member_a_group(m):
@@ -113,12 +188,36 @@ def add_username_to_group(self, group_json):
new_members.append(m)
group_json['members'] = new_members
return group_json
+
+ def fetch_group_page(self, start, count):
+ endpoint = f'/preview/scim/v2/Groups?startIndex={start}&count={count}'
+ response = self.get(endpoint)
+ return response.get('Resources', [])
+
+ def get_active_groups(self):
+ page_size = 10
+ first_response = self.get(f'/preview/scim/v2/Groups?startIndex=1&count=1')
+ total = first_response.get('totalResults', 0)
+
+ if total == 0:
+ return None
+
+ indices = range(1, total + 1, page_size)
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+ futures = [executor.submit(self.fetch_group_page, i, page_size) for i in indices]
+ results = []
+ for future in concurrent.futures.as_completed(futures):
+ results.extend(future.result())
+ return results or None
+
def log_all_groups(self, group_log_dir='groups/'):
group_dir = self.get_export_dir() + group_log_dir
os.makedirs(group_dir, exist_ok=True)
- group_list = self.get("/preview/scim/v2/Groups").get('Resources', [])
+ group_list = self.get_active_groups()
for x in group_list:
+ logging.info(f"group: {x}")
group_name = x['displayName']
# if groups_to_keep is defined, check to see if current group is a member
@@ -126,7 +225,7 @@ def log_all_groups(self, group_log_dir='groups/'):
if group_name not in self.groups_to_keep:
continue
- with open(group_dir + group_name, "w", encoding="utf-8") as fp:
+ with open(group_dir + group_name.replace("/", "_"), "w", encoding="utf-8") as fp:
fp.write(json.dumps(self.add_username_to_group(x)))
@staticmethod
@@ -146,7 +245,8 @@ def log_groups_from_list(self, group_name_list, group_log_dir='groups/', users_l
"""
group_dir = self.get_export_dir() + group_log_dir
os.makedirs(group_dir, exist_ok=True)
- group_list = self.get("/preview/scim/v2/Groups").get('Resources', [])
+ # group_list = self.get("/preview/scim/v2/Groups").get('Resources', [])
+ group_list = self.get_active_groups()
group_dict = self.build_group_dict(group_list)
member_id_list = []
for group_name in group_name_list:
@@ -158,7 +258,7 @@ def log_groups_from_list(self, group_name_list, group_log_dir='groups/', users_l
sub_group_names = list(map(lambda z: z.get('display'), filtered_sub_groups))
group_name_list.extend(sub_group_names)
member_id_list.extend(list(map(lambda y: y['value'], filtered_users)))
- with open(group_dir + group_name, "w", encoding="utf-8") as fp:
+ with open(group_dir + group_name.replace("/", "_"), "w", encoding="utf-8") as fp:
group_details.pop('roles', None) # removing the roles field from the groups arg
fp.write(json.dumps(self.add_username_to_group(group_details)))
users_log = self.get_export_dir() + users_logfile
@@ -176,7 +276,8 @@ def log_groups_from_list(self, group_name_list, group_log_dir='groups/', users_l
def get_user_id_mapping(self):
# return a dict of the userName to id mapping of the new env
- user_list = self.get('/preview/scim/v2/Users').get('Resources', None)
+ # user_list = self.get('/preview/scim/v2/Users').get('Resources', None)
+ user_list = self.get_active_users()
if user_list:
user_id_dict = {}
for user in user_list:
@@ -222,7 +323,7 @@ def assign_group_entitlements(self, group_dir, error_logger):
return
groups = self.listdir(group_dir)
for group_name in groups:
- with open(group_dir + group_name, 'r', encoding="utf-8") as fp:
+ with open(group_dir + group_name.replace("/", "_"), 'r', encoding="utf-8") as fp:
group_data = json.loads(fp.read())
entitlements = group_data.get('entitlements', None)
if entitlements:
@@ -239,7 +340,7 @@ def assign_group_roles(self, group_dir, error_logger):
return
groups = self.listdir(group_dir)
for group_name in groups:
- with open(group_dir + group_name, 'r', encoding="utf-8") as fp:
+ with open(group_dir + group_name.replace("/", "_"), 'r', encoding="utf-8") as fp:
group_data = json.loads(fp.read())
roles = group_data.get('roles', None)
if roles:
@@ -418,7 +519,7 @@ def import_groups(self, group_dir, current_user_ids, error_logger):
# dict of { old_user_id : email }
old_user_emails = self.get_old_user_emails()
for group_name in groups:
- with open(group_dir + group_name, 'r', encoding="utf-8") as fp:
+ with open(group_dir + group_name.replace("/", "_"), 'r', encoding="utf-8") as fp:
members = json.loads(fp.read()).get('members', None)
logging.info(f"Importing group {group_name} :")
if members:
diff --git a/dbclient/WorkspaceClient.py b/dbclient/WorkspaceClient.py
index ffd61cc..e792768 100644
--- a/dbclient/WorkspaceClient.py
+++ b/dbclient/WorkspaceClient.py
@@ -29,7 +29,7 @@ def __init__(self, configs, checkpoint_service):
self._checkpoint_service = checkpoint_service
self.groups_to_keep = configs.get("groups_to_keep", False)
self.skip_missing_users = configs['skip_missing_users']
- self.skip_large_nb = configs['skip_large_nb']
+ self.scim_client = ScimClient(configs, checkpoint_service)
_languages = {'.py': 'PYTHON',
'.scala': 'SCALA',
@@ -351,11 +351,8 @@ def download_notebook_helper(self, notebook_data, checkpoint_notebook_set, error
logging_utils.log_response_error(error_logger, resp)
return resp
if resp.get('error_code', None):
- if self.skip_large_nb and resp.get("message", None) == 'Size exceeds 10485760 bytes':
- logging.info("Notebook {} skipped due to size exceeding limit".format(notebook_path))
- else:
- resp['path'] = notebook_path
- logging_utils.log_response_error(error_logger, resp)
+ resp['path'] = notebook_path
+ logging_utils.log_response_error(error_logger, resp)
return resp
nb_path = os.path.dirname(notebook_path)
if nb_path != '/':
@@ -373,7 +370,7 @@ def download_notebook_helper(self, notebook_data, checkpoint_notebook_set, error
logging.warning(f"Notebook file {save_filename} already exists; please rename in source workspace. "
f"Note that files are case-insensitive")
return {}
-
+        logging.info(f"Saving notebook to {save_filename}")
with open(save_filename, "wb") as f:
f.write(base64.b64decode(resp['content']))
checkpoint_notebook_set.write(notebook_path)
@@ -417,7 +414,7 @@ def log_all_workspace_items_entry(self, ws_path='/', workspace_log_file='user_wo
workspace_log_writer = ThreadSafeWriter(self.get_export_dir() + workspace_log_file, "a")
libs_log_writer = ThreadSafeWriter(self.get_export_dir() + libs_log_file, "a")
dir_log_writer = ThreadSafeWriter(self.get_export_dir() + dir_log_file, "a")
- repos_log_writer = ThreadSafeWriter(self.get_export_dir() + repos_log_file, "a")
+ #repos_log_writer = ThreadSafeWriter(self.get_export_dir() + repos_log_file, "a")
checkpoint_item_log_set = self._checkpoint_service.get_checkpoint_key_set(
wmconstants.WM_EXPORT, wmconstants.WORKSPACE_ITEM_LOG_OBJECT
)
@@ -426,14 +423,14 @@ def log_all_workspace_items_entry(self, ws_path='/', workspace_log_file='user_wo
workspace_log_writer=workspace_log_writer,
libs_log_writer=libs_log_writer,
dir_log_writer=dir_log_writer,
- repos_log_writer=repos_log_writer,
+ repos_log_writer=None,
checkpoint_set=checkpoint_item_log_set,
exclude_prefixes=exclude_prefixes)
finally:
workspace_log_writer.close()
libs_log_writer.close()
dir_log_writer.close()
- repos_log_writer.close()
+ #repos_log_writer.close()
return num_nbs
@@ -456,7 +453,7 @@ def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer
if not os.path.exists(self.get_export_dir()):
os.makedirs(self.get_export_dir(), exist_ok=True)
items = self.get(WS_LIST, get_args).get('objects', None)
- repos = self.get(REPOS).get('repos', None)
+ #repos = self.get(REPOS).get('repos', None)
num_nbs = 0
if self.is_verbose():
logging.info("Listing: {0}".format(get_args['path']))
@@ -467,7 +464,8 @@ def log_all_workspace_items(self, ws_path, workspace_log_writer, libs_log_writer
notebooks = self.filter_workspace_items(items, 'NOTEBOOK')
libraries = self.filter_workspace_items(items, 'LIBRARY')
# only get user list if we are filtering by group
- ws_users = self.get('/preview/scim/v2/Users').get('Resources', None) if self.groups_to_keep else []
+ # ws_users = self.get('/preview/scim/v2/Users').get('Resources', None) if self.groups_to_keep else []
+ ws_users = self.scim_client.get_active_users() if self.groups_to_keep else []
for x in notebooks:
# notebook objects has path and object_id
nb_path = x.get('path')
@@ -535,12 +533,13 @@ def _recurse_log_all_workspace_items(folder):
if num_nbs_plus:
num_nbs += num_nbs_plus
# log all repos
- if repos_log_writer and repos:
- for repo in repos:
- repo_path = repo.get('path', "")
- if not checkpoint_set.contains(repo_path) and not repo_path.startswith(tuple(exclude_prefixes)):
- repos_log_writer.write(json.dumps(repo) + '\n')
- checkpoint_set.write(repo_path)
+
+ # if repos_log_writer and repos:
+ # for repo in repos:
+ # repo_path = repo.get('path', "")
+ # if not checkpoint_set.contains(repo_path) and not repo_path.startswith(tuple(exclude_prefixes)):
+ # repos_log_writer.write(json.dumps(repo) + '\n')
+ # checkpoint_set.write(repo_path)
return num_nbs
diff --git a/dbclient/WorkspaceClient.py.zip b/dbclient/WorkspaceClient.py.zip
new file mode 100644
index 0000000..7ed2243
Binary files /dev/null and b/dbclient/WorkspaceClient.py.zip differ
diff --git a/dbclient/dbclient.py b/dbclient/dbclient.py
index b9f5995..dfca418 100644
--- a/dbclient/dbclient.py
+++ b/dbclient/dbclient.py
@@ -56,6 +56,7 @@ def __init__(self, configs):
self._url = url_validation(configs['url'])
self._update_token(configs['token'])
self._export_dir = configs['export_dir']
+ self._use_logs = configs['use_logs']
self._is_aws = configs['is_aws']
self._is_azure = configs['is_azure']
self._is_gcp = configs['is_gcp']
@@ -422,10 +423,12 @@ def update_email_addresses(self, old_email_address, new_email_address):
:return:
"""
log_dir = self.get_export_dir()
- logs_to_update = ['users.log',
+ logs_to_update = ['users.log',
+ 'jobs.log',
'acl_jobs.log',
'acl_clusters.log', 'acl_cluster_policies.log',
- 'acl_notebooks.log', 'acl_directories.log']
+ 'acl_notebooks.log', 'acl_directories.log',
+ 'secret_scopes_acls.log']
for logfile in logs_to_update:
if os.path.exists(log_dir + logfile):
self.replace_file_contents(old_email_address, new_email_address, logfile)
diff --git a/dbclient/parser.py b/dbclient/parser.py
index e6ad312..03dc8a7 100644
--- a/dbclient/parser.py
+++ b/dbclient/parser.py
@@ -141,7 +141,8 @@ def get_export_parser():
parser.add_argument('--database', action='store',
help='Database name to export for the metastore and table ACLs. Single database name supported')
- # iam role used to export the metastore
+    # iam role used to export the metastore
+
parser.add_argument('--iam', action='store',
help='IAM Instance Profile to export metastore entires')
@@ -191,6 +192,9 @@ def get_export_parser():
parser.add_argument('--set-export-dir', action='store',
help='Set the base directory to export artifacts')
+
+ parser.add_argument('--use-logs', action='store_true',
+                        help='Set flag to use export logs if they exist')
parser.add_argument('--pause-all-jobs', action='store_true',
help='Pause all scheduled jobs')
@@ -453,6 +457,7 @@ def build_client_config(profile, url, token, args):
config['export_dir'] = 'gcp_logs/'
config['use_checkpoint'] = args.use_checkpoint
+ config['use_logs'] = args.use_logs
config['num_parallel'] = args.num_parallel
config['retry_total'] = args.retry_total
config['retry_backoff'] = args.retry_backoff
@@ -489,6 +494,9 @@ def get_pipeline_parser() -> argparse.ArgumentParser:
parser.add_argument('--set-export-dir', action='store',
help='Set the base directory to export artifacts')
+
+ parser.add_argument('--use-logs', action='store_true',
+                        help='Set flag to use export logs if they exist')
parser.add_argument('--cluster-name', action='store', required=False,
help='Cluster name to export the metastore to a specific cluster. Cluster will be started.')
@@ -503,7 +511,15 @@ def get_pipeline_parser() -> argparse.ArgumentParser:
parser.add_argument('--archive-missing', action='store_true',
help='Import all missing users into the top level /Archive/ directory.')
+ # Cluster + Job arguments
+ parser.add_argument('--nitro', action='store_true',
+ help='Set to use Nitro cluster types for all clusters and jobs.')
+
+ # Jobs arguments
+ parser.add_argument('--default-job-owner', action='store', default=False,
+ help='Set a default job owner for jobs without an owner.')
+
# Metastore arguments
parser.add_argument('--repair-metastore-tables', action='store_true', default=False,
help='Repair legacy metastore tables')
@@ -517,6 +533,9 @@ def get_pipeline_parser() -> argparse.ArgumentParser:
parser.add_argument('--skip-missing-users', action='store_true', default=False,
help='Skip missing principles during import.')
+ parser.add_argument('--database', nargs="+", action='store', default=[],
+ help='list of databases to selectively export')
+
# Pipeline arguments
parser.add_argument('--session', action='store', default='',
help='If set, pipeline resumes from latest checkpoint of given session; '
diff --git a/export_db.py b/export_db.py
index 8a0d12e..1bf3d37 100644
--- a/export_db.py
+++ b/export_db.py
@@ -227,7 +227,12 @@ def main():
start = timer()
client = dbclient(client_config)
#parse list list of e-mail mapping pairs. Format is: old1@email.com:new1@e-mail.com,old2email.com:new2@email.com
- emailpairs = args.replace_email.split(',')
+ if args.replace_email == "ALL_LOWERCASE":
+ scim_c = ScimClient(client_config, checkpoint_service)
+ old_emails = scim_c.get_users_from_log()
+ emailpairs = [old_email + ":" + old_email.lower() for old_email in old_emails]
+ else:
+ emailpairs = args.replace_email.split(',')
print(str(len(emailpairs)) +' emails found to replace')
for emailpair in emailpairs:
if len(emailpair.split(':')) < 2:
diff --git a/library_migration.py b/library_migration.py
new file mode 100644
index 0000000..2941c1d
--- /dev/null
+++ b/library_migration.py
@@ -0,0 +1,168 @@
+import json
+import argparse
+import requests
+from datetime import datetime
+import configparser
+import re
+import os
+from os import path
+
+class dbclient:
+ def __init__(self, profile):
+ login = self.get_login_credentials(profile)
+ url = login['host']
+ token = login['token']
+ self.url = self.url_validation(url)
+ self.token = token
+
+ def url_validation(self, url):
+ if '/?o=' in url:
+ # if the workspace_id exists, lets remove it from the URL
+ url = re.sub("\/\?o=.*", '', url)
+ elif 'net/' == url[-4:]:
+ url = url[:-1]
+ elif 'com/' == url[-4:]:
+ url = url[:-1]
+ return url.rstrip("/")
+
+ def get_login_credentials(self, profile='DEFAULT'):
+ creds_path = '~/.databrickscfg'
+ config = configparser.ConfigParser()
+ abs_creds_path = path.expanduser(creds_path)
+ config.read(abs_creds_path)
+ try:
+ current_profile = dict(config[profile])
+ if not current_profile:
+ raise ValueError(f"Unable to find a defined profile to run this tool. Profile \'{profile}\' not found.")
+ return current_profile
+ except KeyError:
+ raise ValueError(
+ 'Unable to find credentials to load for profile. Profile only supports tokens.')
+
+ def get_url_token(self):
+ return self.url, self.token
+
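+# Example (profile name is an illustrative placeholder); host and token are read from ~/.databrickscfg:
+#   old_client = dbclient(profile="OLD_WORKSPACE")
+#   old_url, old_token = old_client.get_url_token()
+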
+def get_libraries_cluster(token, workspace_url, cluster_id):
+ url = f"{workspace_url}/api/2.0/libraries/cluster-status"
+ print(f"{datetime.now()} Endpoint: {url}")
+ print(f"{datetime.now()} Getting list of libraries from clusters... ")
+    st_response = requests.get(url, headers = {"Authorization": f"Bearer {token}"}, json = {"cluster_id": cluster_id})
+
+ if st_response.status_code != 200:
+ print(f"{datetime.now()} ERROR. ")
+ print(st_response.content)
+ return ''
+ else:
+ st_statuses = st_response.json()
+ return st_statuses
+
+def get_cluster_name(token, workspace_url):
+ url = f"{workspace_url}/api/2.0/clusters/list"
+ print(f"{datetime.now()} Endpoint: {url}")
+ print(f"{datetime.now()} Getting list of clusters from {workspace_url}... ")
+
+    response = requests.get(url, headers = {"Authorization": f"Bearer {token}"})
+
+ if response.status_code != 200:
+ print(f"{datetime.now()} ERROR. ")
+ raise Exception(response.content)
+ else:
+ return response.json()
+
+# Find ST cluster_name from the ST cluster_id
+def find_cluster_name(cluster_id, json_list):
+ for i in json_list:
+ if cluster_id == i['cluster_id']:
+ return i['cluster_name']
+ return ''
+# Find E2 cluster id using the cluster_name
+def find_cluster_id(cluster_name, json_list):
+ for i in json_list:
+ if cluster_name == i['cluster_name']:
+ return i['cluster_id']
+ return ''
+
+def export_pipeline(old_profile, new_profile):
+ old_dbclient = dbclient(profile=old_profile)
+ old_url, old_token = old_dbclient.get_url_token()
+
+ st_clusters = get_cluster_name(old_token, old_url)
+
+ new_dbclient = dbclient(profile=new_profile)
+ new_url, new_token = new_dbclient.get_url_token()
+
+ e2_clusters = get_cluster_name(new_token, new_url)
+
+ st_clusters['clusters'] = [i for i in st_clusters['clusters'] if 'JOB' not in i['cluster_source']]
+ e2_clusters['clusters'] = [i for i in e2_clusters['clusters'] if 'JOB' not in i['cluster_source']]
+
+ st_statuses = []
+ for i in st_clusters['clusters']:
+ st_statuses.append(get_libraries_cluster(old_token, old_url, i['cluster_id']))
+
+ no_libraries = []
+ with_libraries = []
+ for i in st_statuses:
+ try:
+ st_cname = find_cluster_name(i['cluster_id'], st_clusters['clusters'])
+ if st_cname != '':
+ e2_cid = find_cluster_id(st_cname, e2_clusters['clusters'])
+ if e2_cid != '':
+ print(f"{datetime.now()} Creating Cluster ID Mapping... ")
+ print(f"{' '*26} Cluster Name: {st_cname} {i['cluster_id']} -> {e2_cid}")
+ i['cluster_id'] = e2_cid
+ with_libraries.append({
+ 'cluster_id': e2_cid,
+ 'libraries': [j['library'] for j in i['library_statuses']]
+ })
+ else:
+ print(f"{datetime.now()} Error: Cannot find the cluster {st_cname} in new workspace")
+ else:
+ print(f"{datetime.now()} Error: Cannot find the cluster id {i['cluster_id']} in the original workspace")
+        except Exception as e:
+            # clusters whose library status could not be read (e.g. empty responses or missing library_statuses)
+            no_libraries.append(i['cluster_id'] if isinstance(i, dict) else i)
+
+ return with_libraries, no_libraries
+
+def install_library(token, workspace_url, data):
+ library_install_url = f"{workspace_url}/api/2.0/libraries/install"
+ print(f"{datetime.now()} Endpoint: {library_install_url}")
+ print(f"{datetime.now()} Installing libraries on new clusters... ")
+
+ for i in data:
+        response = requests.post(library_install_url, headers = {"Authorization": f"Bearer {token}"}, json=i)
+
+ if response.status_code == 200:
+ print(f"{datetime.now()} Successfully added libraries for", i['cluster_id'])
+ else:
+ print(f"{datetime.now()} Error: Cannot add libraries for", i['cluster_id'])
+ print(response.content)
+
+def import_pipeline(new_profile, data):
+ new_dbclient = dbclient(profile=new_profile)
+ new_url, new_token = new_dbclient.get_url_token()
+ install_library(new_token, new_url, data)
+ return
+
+
+def main():
+ all_args = argparse.ArgumentParser()
+ all_args.add_argument('--old-profile', dest="old", help="Profile of the old workspace. ")
+ all_args.add_argument('--new-profile', dest="new", help="Profile of the new workspace. ")
+ args = all_args.parse_args()
+
+ old_dbclient = args.old
+ new_dbclient = args.new
+
+ print(f"{datetime.now()} EXPORTING LIBRARIES... ")
+ libraries_data, no_libraries = export_pipeline(old_dbclient, new_dbclient)
+ print()
+ confirm = input(f"Import? (y/N) ")
+ if confirm.lower() in ["y", "yes"]:
+ print(f"{datetime.now()} IMPORTING LIBRARIES... ")
+ import_pipeline(new_dbclient, libraries_data)
+ else:
+ print(f"{datetime.now()} EXITING PIPELINE... ")
+
+if __name__ == "__main__":
+ main()
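+
+# Example invocation (a sketch; the profile names are hypothetical entries in ~/.databrickscfg):
+#   python library_migration.py --old-profile oldworkspace --new-profile e2workspace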
diff --git a/map.py b/map.py
new file mode 100644
index 0000000..77a89e2
--- /dev/null
+++ b/map.py
@@ -0,0 +1,34 @@
+import argparse
+from utils.create_workspace import Workspace as Workspace
+
+def main():
+ # checkpoints are optional for export but you will need to use them for the import
+ # (each workspace is a 'checkpoint')
+
+    # arguments: checkpoint, workspaces, default-job-owner, and tag
+ all_args = argparse.ArgumentParser()
+ all_args.add_argument("--checkpoint", dest="checkpoint", default="", help="set if you are using a checkpoint during export")
+ all_args.add_argument("--workspaces", dest="workspaces", nargs="+", required=True, help="list of workspace names. must match columns in asset_mapping.xslx.")
+ all_args.add_argument('--default-job-owner', dest="default_job_owner", default=False, help="set if you want to add a job owner to jobs that drop untagged owners.")
+    all_args.add_argument('--tag', dest="tag", default='Y', help="tag used in asset_mapping.xlsx.")
+
+ args = all_args.parse_args()
+
+ checkpoint = args.checkpoint
+ workspaces = args.workspaces
+ default_owner = args.default_job_owner
+ tag = args.tag
+
+ # for each workspace
+ for w in workspaces:
+        # create a Workspace object (see create_workspace.py): it records the original log location
+        # for this session and the new per-workspace location, and it instantiates a Split object
+        # (see split_logs.py), which tracks the same paths plus the imported users and groups
+        # that are later used when rebuilding ACLs
+ workspace = Workspace(checkpoint, w, workspaces, default_owner, tag)
+ success = workspace.run()
+
+ workspace.copy_other_files()
+
+if __name__ == "__main__":
+ main()
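+
+# Example invocation (a sketch; workspace names are hypothetical and must match column headers in asset_mapping.xlsx):
+#   python map.py --checkpoint ckpt1 --workspaces workspace_a workspace_b --tag Y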
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b8883ef
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,38 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: osx-64
+blas=1.0=mkl
+bottleneck=1.3.4=py310h4e76f89_0
+bzip2=1.0.8=h1de35cc_0
+ca-certificates=2022.4.26=hecd8cb5_0
+certifi=2022.6.15=py310hecd8cb5_0
+et_xmlfile=1.1.0=py310hecd8cb5_0
+intel-openmp=2021.4.0=hecd8cb5_3538
+libcxx=12.0.0=h2f01273_0
+libffi=3.3=hb1e8313_2
+mkl=2021.4.0=hecd8cb5_637
+mkl-service=2.4.0=py310hca72f7f_0
+mkl_fft=1.3.1=py310hf879493_0
+mkl_random=1.2.2=py310hc081a56_0
+ncurses=6.3=hca72f7f_2
+numexpr=2.8.1=py310hdcd3fac_2
+numpy=1.22.3=py310hdcd3fac_0
+numpy-base=1.22.3=py310hfd2de13_0
+openpyxl=3.0.9=pyhd3eb1b0_0
+openssl=1.1.1o=hca72f7f_0
+packaging=21.3=pyhd3eb1b0_0
+pandas=1.4.2=py310he9d5cce_0
+pip=21.2.4=py310hecd8cb5_0
+pyparsing=3.0.4=pyhd3eb1b0_0
+python=3.10.4=hdfd78df_0
+python-dateutil=2.8.2=pyhd3eb1b0_0
+pytz=2022.1=py310hecd8cb5_0
+readline=8.1.2=hca72f7f_1
+setuptools=61.2.0=py310hecd8cb5_0
+six=1.16.0=pyhd3eb1b0_1
+sqlite=3.38.5=h707629a_0
+tk=8.6.12=h5d9f67b_0
+tzdata=2022a=hda174b7_0
+wheel=0.37.1=pyhd3eb1b0_0
+xz=5.2.5=hca72f7f_1
+zlib=1.2.12=h4dc903c_2
diff --git a/tasks/tasks.py b/tasks/tasks.py
index 593b10e..5ea2b0d 100644
--- a/tasks/tasks.py
+++ b/tasks/tasks.py
@@ -226,7 +226,7 @@ def __init__(self, client_config, args, checkpoint_service, skip=False):
def run(self):
cl_c = ClustersClient(self.client_config, self.checkpoint_service)
cl_c.import_cluster_policies()
- cl_c.import_cluster_configs()
+ cl_c.import_cluster_configs(nitro=self.args.nitro)
class InstancePoolsImportTask(AbstractTask):
@@ -258,9 +258,9 @@ def run(self):
jobs_c = JobsClient(self.client_config, self.checkpoint_service)
if self.client_config.get("groups_to_keep"):
- jobs_c.log_job_configs(groups_list=self.client_config.get("groups_to_keep"))
+ jobs_c.log_job_configs(groups_list=self.client_config.get("groups_to_keep"), default_job_owner=self.args.default_job_owner)
else:
- jobs_c.log_job_configs()
+ jobs_c.log_job_configs(default_job_owner=self.args.default_job_owner)
class JobsImportTask(AbstractTask):
@@ -277,7 +277,7 @@ def __init__(self, client_config, args, checkpoint_service, skip=False):
def run(self):
jobs_c = JobsClient(self.client_config, self.checkpoint_service)
- jobs_c.import_job_configs()
+ jobs_c.import_job_configs(nitro=self.args.nitro)
class MetastoreExportTask(AbstractTask):
@@ -295,7 +295,8 @@ def __init__(self, client_config, checkpoint_service, args, skip=False):
def run(self):
hive_c = HiveClient(self.client_config, self.checkpoint_service)
hive_c.export_hive_metastore(cluster_name=self.args.cluster_name,
- has_unicode=self.args.metastore_unicode)
+ has_unicode=self.args.metastore_unicode,
+ database=self.args.database,)
class MetastoreImportTask(AbstractTask):
diff --git a/utils/HMS_Modification_Get_Database.py b/utils/HMS_Modification_Get_Database.py
new file mode 100644
index 0000000..a9258e5
--- /dev/null
+++ b/utils/HMS_Modification_Get_Database.py
@@ -0,0 +1,112 @@
+import os
+import argparse
+import json
+
+class MetastoreUpdater:
+
+ def __init__(self, metastore_logs, root_bucket, mount_point, database_details_log):
+ self.metastore_logs = metastore_logs
+ self.root_bucket = root_bucket
+ self.database_details_log = database_details_log
+ if mount_point:
+ self.mount_point = mount_point
+ else:
+ self.mount_point = False
+ self.errors = {}
+ self.updated_ddls = {}
+
+ def duplicate_metastore_as_backup(self):
+ # Get the path up one level from self.metastore_logs
+ backup_dir = os.path.join(os.path.dirname(self.metastore_logs), 'metastore_backup')
+ os.makedirs(backup_dir, exist_ok=True)
+
+ for i in os.listdir(self.metastore_logs):
+ if i not in ['backup', '.DS_Store', '.ipynb_checkpoints']:
+ os.system(f"cp -r {os.path.join(self.metastore_logs, i)} {backup_dir}")
+
+ def get_database_details_log(self):
+ # get the database details log
+ with open(self.database_details_log, 'r') as f:
+ db_details = f.read()
+
+ # split the log by new line
+ db_details = db_details.split('\n')
+
+ # get the database details
+ database_details = {}
+ for db in db_details:
+ try:
+ db = json.loads(db)
+ db_name = db['Namespace Name']
+ db_location = db['Location']
+ database_details[db_name] = db_location
+ except json.decoder.JSONDecodeError:
+ print("Error decoding JSON for database:", db)
+ continue
+
+ return database_details
+
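+    # Example of a line get_database_details_log expects in the database details log
+    # (a sketch; the database name and location are hypothetical):
+    #   {"Namespace Name": "sales_db", "Location": "s3://my-root-bucket/sales_db.db"}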
+
+ def update_metastore(self):
+ db_list = [i for i in os.listdir(self.metastore_logs) if i not in ['.DS_Store', '.ipynb_checkpoints']]
+
+ for db in db_list:
+ db_path = os.path.join(self.metastore_logs, db)
+
+ table_list = [i for i in os.listdir(db_path) if i not in ['.DS_Store', '.ipynb_checkpoints']]
+
+ for table in table_list:
+ table_path = os.path.join(db_path, table)
+
+ with open(table_path, 'r') as f:
+ ddl = f.read()
+
+ if "location '" in ddl.lower():
+ self.errors[db + table] = "location found in ddl" + ddl
+ continue
+
+ if "create view" in ddl.lower():
+ self.errors[db + table] = "create view found in ddl" + ddl
+ continue
+
+ if db != 'default':
+ db_details_dict = self.get_database_details_log()
+ if db in db_details_dict:
+ location = db_details_dict[db] + "/" + table
+ else:
+ print(f"ERROR: Database {db} not found in database details log")
+ continue
+
+ new_ddl = ddl + "\nLOCATION '" + location + "'"
+
+ with open(table_path, 'w') as f:
+ f.write(new_ddl)
+
+ self.updated_ddls[db + table] = new_ddl
+
+ def analyze_performance(self):
+ # Print the number of tables updated
+ print(f"Number of tables updated: {len(self.updated_ddls)}")
+ # Print the number of errors
+ print(f"Number of errors: {len(self.errors)}")
+ # Print the errors with create view found in ddl
+ print("Number of view errors: ", len([i for i in self.errors.values() if "create view found in ddl" in i]))
+ # Print the errors with location found in ddl
+ print("Number of location errors: ", len([i for i in self.errors.values() if "location found in ddl" in i]))
+
+
+def parser():
+ parser = argparse.ArgumentParser(description='Update metastore logs')
+ parser.add_argument('--metastore_logs', type=str, help='Path to metastore logs', required=True)
+ parser.add_argument('--root_bucket', type=str, help='Root bucket name', required=False)
+ parser.add_argument('--mount_point', type=str, help='Mount point', required=False)
+ parser.add_argument('--database_details_log', type=str, help='Database details log', required=False)
+ args = parser.parse_args()
+ return args
+
+if __name__ == '__main__':
+ args = parser()
+ updater = MetastoreUpdater(args.metastore_logs, args.root_bucket, args.mount_point, args.database_details_log)
+ updater.duplicate_metastore_as_backup()
+ updater.update_metastore()
+ updater.analyze_performance()
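+
+# Example invocation (a sketch; paths are hypothetical):
+#   python utils/HMS_Modification_Get_Database.py --metastore_logs ./logs/session1/metastore \
+#       --database_details_log ./logs/session1/database_details.log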
diff --git a/utils/create_asset_mapping_spreadsheet.py b/utils/create_asset_mapping_spreadsheet.py
new file mode 100644
index 0000000..9686b52
--- /dev/null
+++ b/utils/create_asset_mapping_spreadsheet.py
@@ -0,0 +1,75 @@
+'''
+This script creates an asset mapping Excel spreadsheet using the csvs created by convert_all_logs.py
+Each csv gets its own sheet and adds a tag column for the customer to tag assets
+The resulting .xlsx file can be imported into a Google Sheet
+'''
+
+import os, csv, openpyxl
+
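+# Example (assumed layout): the ./csv folder produced by convert_all_logs.py might contain
+# users.csv, jobs.csv, clusters.csv, ...; each file becomes a sheet of the same name in
+# asset_mapping.xlsx, with a 'tag' column inserted as column B for the customer to fill in.
+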
+
+def csv_to_excel(input_folder):
+
+ # Instantiate a new Excel workbook
+ workbook = openpyxl.Workbook()
+
+ # Loop through csv's and read each one line-by-line using csv.reader
+ for csv_file in os.listdir(input_folder):
+ csv_data = []
+
+ full_path = input_folder + '/' + csv_file
+
+ with open(full_path) as file_obj:
+ reader = csv.reader(file_obj)
+ for row in reader:
+ csv_data.append(row)
+
+ # Use the name of the csv file as the sheet name
+ sheet_name = os.path.splitext(csv_file)[0]
+
+ # Create new sheet
+ workbook.create_sheet(title=sheet_name)
+ sheet = workbook[sheet_name]
+
+ # Insert csv data into sheet
+ for row in csv_data:
+ sheet.append(row)
+
+ # Insert a tag column before column at index 2
+ sheet.insert_cols(2)
+ sheet['B1'] = 'tag'
+
+ # Freeze the first row and first two columns
+ sheet.freeze_panes = sheet['C2']
+
+ # Resizing columns to fit cell contents, has a max value in case columns are very wide
+ for col in sheet.columns:
+ max_length = 0
+ column = col[0].column_letter # Get the column name
+ for cell in col:
+                try: # Necessary to avoid error on empty cells
+                    if len(str(cell.value)) > max_length:
+                        max_length = len(str(cell.value))
+ except:
+ pass
+
+ # Copied this formula from an example, seems to do a good job
+ adjusted_width = round((max_length + 2) * 1.2, 0)
+
+ # Keeps column widths from getting too large, 75 is arbitrary
+ if adjusted_width > 75:
+ adjusted_width = 75
+
+ sheet.column_dimensions[column].width = adjusted_width
+
+ # Remove the default sheet
+ default_sheet = workbook['Sheet']
+ workbook.remove(default_sheet)
+
+ # Save the Excel file
+ workbook.save("asset_mapping.xlsx")
+
+def main():
+ csv_to_excel(os.getcwd() + '/csv')
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/utils/create_sample_jobs_new.py b/utils/create_sample_jobs_new.py
new file mode 100644
index 0000000..0374072
--- /dev/null
+++ b/utils/create_sample_jobs_new.py
@@ -0,0 +1,87 @@
+import json
+import pandas as pd
+import csv
+import os
+import datetime
+import argparse
+
+def read_log(file_name):
+ try:
+ with open("./all_jobs/"+file_name) as f:
+ data = f.read().split("\n")
+ return data[:-1]
+ except FileNotFoundError as e:
+ return ''
+ except Exception as e:
+ print("Error while reading file:", file_name, "\n", e)
+ return ''
+
+def move_logs(timestamp=""):
+ # moving all_jobs
+ os.rename("jobs.log", f"./all_jobs/jobs{timestamp}.log")
+ os.rename("acl_jobs.log", f"./all_jobs/acl_jobs{timestamp}.log")
+
+def write_job_log(data, sample_job_ids):
+ with open("jobs.log", "w") as jl:
+ for d in data:
+ try:
+ d = json.loads(d)
+ if d['job_id'] in sample_job_ids:
+ jl.write(json.dumps(d) + "\n")
+            except Exception as e:
+                print("Error while writing jobs.log:", e)
+
+
+def write_job_acls_log(data, sample_job_ids):
+ with open("acl_jobs.log", "w") as jal:
+ for d in data:
+ try:
+ d = json.loads(d)
+ if int(d['object_id'].split("/")[-1]) in sample_job_ids:
+ jal.write(json.dumps(d) + "\n")
+            except Exception as e:
+                print("Error while writing acl_jobs.log:", e)
+
+def write_rest_job_logs(jobslog, acljobslog, sample_job_ids):
+ with open("other_jobs.log", "w") as ojl:
+ for d in jobslog:
+ try:
+ d = json.loads(d)
+ if d['job_id'] not in sample_job_ids:
+ ojl.write(json.dumps(d) + "\n")
+            except Exception as e:
+                print("Error while writing other_jobs.log:", e)
+
+ with open("other_acl_jobs.log", "w") as ojal:
+ for d in acljobslog:
+ try:
+ d = json.loads(d)
+ if int(d['object_id'].split("/")[-1]) not in sample_job_ids:
+ ojal.write(json.dumps(d) + "\n")
+            except Exception as e:
+                print("Error while writing other_acl_jobs.log:", e)
+
+def main():
+
+ job_ids = [410104035299, 30596903773550, 97211745563636]
+
+ if "all_jobs" not in os.listdir():
+ os.mkdir("./all_jobs/")
+ move_logs()
+ elif "jobs.log" in os.listdir():
+ ts = datetime.datetime.now()
+ move_logs("_"+str(ts))
+
+ #json objects
+ job_log_data = read_log("jobs.log")
+ job_acl_log_data = read_log("acl_jobs.log")
+
+    #move jobs.log into ./all_jobs folder + write sample jobs log in main logs folder
+ write_job_log(job_log_data, job_ids)
+ write_job_acls_log(job_acl_log_data, job_ids)
+
+ #write jobs.log that only contains jobs NOT in sample jobs log
+ write_rest_job_logs(job_log_data, job_acl_log_data, job_ids)
+
+if __name__ == "__main__":
+ main()
diff --git a/utils/create_workspace.py b/utils/create_workspace.py
new file mode 100644
index 0000000..6992ad9
--- /dev/null
+++ b/utils/create_workspace.py
@@ -0,0 +1,118 @@
+from utils.split_logs import Split
+import os
+import shutil
+import pandas as pd
+from datetime import datetime
+
+class Workspace():
+ def __init__(self, checkpoint, workspace, all_workspaces, default_owner=False, tag='Y'):
+ self.path = "./logs/"+checkpoint+"/"
+ self.workspace = str(workspace)
+ self.new_path = "./logs/"+checkpoint+"_"+workspace+"/"
+ self.workspaces = all_workspaces
+ self.checkpoint = checkpoint
+ self.tag = tag
+ split = Split(checkpoint, workspace, default_owner)
+
+        # maps each asset type to the asset_mapping.xlsx sheet it reads and the Split method that performs the split
+ self.map = {
+ 'users': ["users", split.users],
+ 'instance_pools' : ["instance_pools", split.instance_pools],
+ 'instance_profiles': ["instance_profiles", split.instance_profiles],
+ 'groups': ["groups", split.groups],
+ 'jobs': ["jobs", split.jobs],
+ 'acl_jobs': ["jobs", split.acl_jobs],
+ 'secret_scopes': ["secret_scopes", split.secret_scopes],
+ 'secret_scopes_acls':["secret_scopes", split.secret_scopes_acls],
+ 'clusters': ["clusters", split.clusters],
+ 'cluster_policies': ["clusters", split.cluster_policy],
+ 'acl_clusters':["clusters", split.acl_clusters],
+ 'acl_cluster_policies': ["clusters", split.acl_cluster_policies],
+ 'mounts': ["mounts", split.mounts],
+ 'shared_notebooks': ["global_shared_logs", split.shared_notebooks],
+ 'global_notebooks': ["global_logs", split.global_notebooks],
+ 'user_notebooks': ["users", split.user_notebooks],
+ 'user_dirs': ["users", split.user_dirs],
+ 'user_workspace': ["users", split.user_workspace],
+ 'acl_notebooks':["users", split.acl_notebooks],
+ 'acl_directories':["users", split.acl_directories],
+ 'metastore': ["metastore", split.metastore],
+ 'success_metastore': ["metastore", split.success_metastore],
+ 'table_acls':["metastore", split.table_acls],
+ "database_details": ["metastore", split.database_details]
+ }
+ print("-"*80)
+ print(f"CREATING WORKSPACE {workspace}...")
+ self.create_workspace(workspace, checkpoint)
+
+ @staticmethod
+ def create_workspace(wk="test", checkpoint=""):
+ """
+ summary: creates a directory for each workspace
+ """
+ directories = os.listdir("./logs/")
+ name = checkpoint+"_"+wk
+ if name not in directories:
+ os.mkdir("./logs/"+name)
+ #print("Workspace directory {} was successfully created.".format(name))
+
+ def copy_other_files(self):
+ """
+ summary: copy files that need to be copied to all workspace folders
+ """
+ total = ['app_logs', 'checkpoint', 'source_info.txt']
+ for w in self.workspaces:
+            # skip anything that is already present in this workspace's folder
+            total_in_workspace = os.listdir("./logs/"+self.checkpoint+"_"+w)
+            for file in total:
+                if file not in total_in_workspace:
+ try:
+ # if it is a file, copy just that file. otherwise, copy all files recursively in it
+ if os.path.isfile("./logs/"+self.checkpoint+"/"+file):
+ #print(f"Copying file {file} to workspace {w}")
+ shutil.copy("./logs/"+self.checkpoint+"/"+file, "./logs/"+self.checkpoint+"_"+w+"/"+file)
+ else:
+ #print(f"Copying directory {file} to workspace {w}")
+ shutil.copytree("./logs/"+self.checkpoint+"/"+file, "./logs/"+self.checkpoint+"_"+w+"/"+file)
+ except Exception as e:
+ pass
+
+ def run(self):
+ """
+ summary: run each module for every asset
+ """
+ # for each
+ for m in self.map.keys():
+ try:
+ # get the asset function that splits that asset
+ module_function = self.map[m][1]
+ # get the appropriate csv that matches it
+ sheet = self.map[m][0]
+ # split_csv performs the actual split and outputs all csvs that were not in the csv directory
+ print(f"{datetime.now()} Working on {m}...")
+ success = self.split_csv(m, module_function, sheet, self.tag)
+
+            except Exception as e:
+                print(f"{datetime.now()} Error: could not split {m}: {e}")
+
+ print(f"{datetime.now()} Please review error logs in the {self.new_path}errors/ directory to confirm successful split. ")
+ return 0
+
+ def split_csv(self, module, module_function, sheet_name, tag="Y"):
+        # reads the matching sheet of asset_mapping.xlsx and keeps the rows where this workspace's
+        # column is set to the tag (default 'Y'); pass --tag to match True, 1, or whatever value
+        # the client used, and any row with a different value is ignored
+ df = pd.read_excel("asset_mapping.xlsx", sheet_name = sheet_name)
+ current_df = df[df[self.workspace] == tag]
+ # send that subset dataframe to the module function found in Split class
+ errors = module_function(current_df.reset_index())
+ #pushing all errors to a csv
+ if 'errors' not in os.listdir(self.new_path):
+ os.mkdir(self.new_path + 'errors')
+
+ er = pd.DataFrame(errors)
+ if len(er) > 0:
+ print(f"{datetime.now()} WARNING: There are {len(er)} errors. Please review error logs for {module}")
+ er.to_csv(self.new_path + 'errors/' + module + '.csv')
+ # success should be 0
+ return 0
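+
+# Sketch of the assumed asset_mapping.xlsx layout that split_csv relies on (hypothetical values):
+# each sheet carries the asset's attribute columns plus one column per target workspace, and a row
+# is routed to a workspace when that workspace's column holds the tag (default 'Y'), e.g.
+#   cluster_name     | workspace_a | workspace_b
+#   etl-nightly      | Y           |
+#   adhoc-analytics  |             | Y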
diff --git a/utils/databricks_delete.py b/utils/databricks_delete.py
new file mode 100644
index 0000000..7453fb1
--- /dev/null
+++ b/utils/databricks_delete.py
@@ -0,0 +1,266 @@
+import argparse
+import requests
+import json
+import sys
+import time
+import os
+import datetime
+import configparser
+import re
+
+
+class Databricks(object):
+
+ def __init__(self, **kwargs):
+ profile = kwargs['profile'] if 'profile' in kwargs else 'DEFAULT'
+ login = self.get_login_credentials(profile)
+ url = login['host']
+ token = login['token']
+ self.host = self.url_validation(url)
+ self.token = token
+ print(f"Running on {self.host}")
+ self.check_file = kwargs['check_file'] if 'check_file' in kwargs else None
+ self.session = kwargs['session'] if 'session' in kwargs else None
+ self.retry_backoff = kwargs['retry_backoff'] if 'retry_backoff' in kwargs else 0.1
+
+ def progress(self, _cur, _max):
+ p = round(100*_cur/_max)
+ b = f"Progress: {_cur}/{_max}"
+ print(b, end="\r")
+
+ def collect_jobs(self):
+ host = self.host
+ token = self.token
+ jobs_list = requests.get("{db_url}/api/2.0/jobs/list".format(db_url=host), headers={
+ "Authorization": "Bearer {bearer_token}".format(bearer_token=token),
+ "Content-Type": "application/json"})
+ log_file = "./logs/delete_jobs.log"
+ logger = open(log_file, 'w+')
+ logger.write("NEW RUN LOGGED: " + str(datetime.datetime.now()) + "\n")
+ logger.write("..." * 5 + "\n")
+ jobs = jobs_list.json()['jobs']
+        job_ids = [{'job_id': job['job_id'], 'created_time': job['created_time']} for job in jobs]
+ job_ids = sorted(job_ids, key=lambda i: i['job_id'])
+ job_names_e2 = [job['settings']['name'] for job in jobs]
+ print("Total jobs: " + str(len(job_ids)))
+ logger.write("Total jobs: " + str(len(job_ids)) + "\n")
+ print("..." * 5, end="\r")
+ job_names = []
+ if self.check_file:
+ with open(self.check_file) as f:
+ check_file = f.readlines()
+
+ check_file = [x.split(',')[1] for x in check_file]
+ check_file = [x.strip() for x in check_file]
+ print("Total jobs to check: " + str(len(check_file)))
+ print("..." * 5, end="\r")
+ for job in jobs:
+ if job['settings']['name'] in check_file:
+ job_names.append(job['settings']['name'])
+
+ skipped_jobs = [job for job in check_file if job not in job_names_e2]
+ print("Skipped jobs: " + str(len(skipped_jobs)))
+ job_ids = [{'job_id': job['job_id'], 'created_time': job['created_time']} for job in jobs if job['settings']['name'] in job_names]
+ logger.write("Total jobs to check: " + str(len(check_file)) + "\n")
+ logger.write("..." * 5 + "\n")
+ logger.write("Total jobs to delete: " + str(len(job_ids)) + "\n")
+ logger.write("..." * 5 + "\n")
+ logger.write("Not deleted jobs in E2: \n")
+ logger.write(','.join([json.dumps({'job_id': job['job_id'], 'job_name': job['settings']['name'], 'created_time': job['created_time']}) for job in jobs if job['settings']['name'] not in job_names]))
+ logger.write("\n")
+ logger.write("Deleted jobs in E2: \n")
+ logger.write(','.join([json.dumps({'job_id': job['job_id'], 'job_name': job['settings']['name'], 'created_time': job['created_time']}) for job in jobs if job['settings']['name'] in job_names]))
+ logger.write("\n")
+ logger.write("Check jobs not found in E2: \n")
+ logger.write(','.join(skipped_jobs))
+ logger.close()
+
+ print("Total jobs to delete: " + str(len(job_ids)))
+ print("List of job names to delete: " + str(job_names))
+ user_response = input("Do you want to continue (y/n): ")
+ if str(user_response).lower() != 'y':
+ sys.exit(1)
+ return job_ids
+
+ def collect_clusters(self):
+ host = self.host
+ token = self.token
+ clusters_list = requests.get("{db_url}/api/2.0/clusters/list".format(db_url=host), headers={
+ "Authorization": "Bearer {bearer_token}".format(bearer_token=token),
+ "Content-Type": "application/json"})
+ clusters = clusters_list.json()['clusters']
+ cluster_ids = [{'cluster_id': cluster['cluster_id'], 'state': cluster['state']} for cluster in clusters]
+ cluster_ids = sorted(cluster_ids, key=lambda i: i['cluster_id'])
+ print("Total clusters: " + str(len(cluster_ids)))
+ print("..." * 5, end="\r")
+ return cluster_ids
+
+ def delete_clusters(self):
+ host = self.host
+ token = self.token
+
+ cluster_ids = self.collect_clusters()
+ output_file = f"./logs/{self.session}/delete_clusters.log"
+ fd = open(output_file, 'a+')
+ print("*" * 80, file=fd)
+ print("NEW RUN LOGGED: " + str(datetime.datetime.now()), file=fd)
+ print("cluster_id,status", file=fd)
+ cluster_num = 0
+ cluster_max = len(cluster_ids)
+ for cluster_id in cluster_ids:
+ if cluster_id['state'] == 'RUNNING':
+ print("Cluster " + str(cluster_id['cluster_id']) + " is running. So not deleting this cluster")
+ self.progress(cluster_num, cluster_max)
+ cluster_num += 1
+ continue
+ data = {
+ "cluster_id": "{cluster_id}".format(cluster_id=cluster_id['cluster_id'])
+ }
+ result = requests.post("{db_url}/api/2.0/clusters/delete".format(db_url=host), headers={"Authorization": "Bearer {bearer_token}".format(bearer_token=token), "Content-Type": "application/json"}, json=data)
+ print("{cluster_id},{status}".format(cluster_id=cluster_id, status=result.status_code), file=fd)
+ self.progress(cluster_num, cluster_max)
+ cluster_num += 1
+ print("..." * 5, end="\r")
+ print("Done")
+ fd.close()
+
+ def progress_bar(self, current, total, starttime, currenttime, barLength = 20):
+ percent = (current / total) * 100
+ arrow = '-' * int(percent / 100 * barLength - 1) + '>'
+ spaces = ' ' * (barLength - len(arrow))
+ # want to do two decimal points
+ time_elapsed = currenttime - starttime
+ time_remaining = (time_elapsed / (current + 1)) * (total - (current + 1))
+ time_remaining_fmt = str(datetime.timedelta(seconds=time_remaining))
+ print(f'Progress: [{arrow + spaces}] {percent:.2f}% Estimated time remaining: {time_remaining_fmt}', end='\r')
+
+ def delete_jobs(self):
+ host = self.host
+ token = self.token
+
+ job_ids = self.collect_jobs()
+ output_file = f"./logs/{self.session}/delete_jobs.log"
+ fd = open(output_file, 'a+')
+ print("*" * 80, file=fd)
+ print("NEW RUN LOGGED: " + str(datetime.datetime.now()), file=fd)
+ print("job_id,status", file=fd)
+ job_num = 0
+ job_max = len(job_ids)
+ for job_id in job_ids:
+ job_runs = requests.get("{db_url}/api/2.0/jobs/runs/list?job_id={jobid}&active_only=true".format(db_url=host, jobid=job_id['job_id']), headers={"Authorization": "Bearer {bearer_token}".format(bearer_token=token), "Content-Type": "application/json"})
+ if job_runs.status_code == 200 and "runs" in job_runs.json():
+ print("Job " + str(job_id['job_id']) + " is active. So not deleting this job")
+ self.progress(job_num, job_max)
+ job_num += 1
+ continue
+ data = {
+ "job_id": "{job_id}".format(job_id=job_id['job_id'])
+ }
+ result = requests.post("{db_url}/api/2.0/jobs/delete".format(db_url=host), headers={"Authorization": "Bearer {bearer_token}".format(bearer_token=token), "Content-Type": "application/json"}, json=data)
+ print("{job_id},{status}".format(job_id=job_id, status=result.status_code), file=fd)
+ self.progress(job_num, job_max)
+ job_num += 1
+ print("..." * 5, end="\r")
+ print("Done")
+ fd.close()
+
+ def read_log_file(self, log_file):
+ with open(log_file, 'r') as f:
+ return f.readlines()
+
+ def delete_workspace_obj(self, path):
+ url = self.host
+ token = self.token
+ api_url = f"{url}/api/2.0/workspace/delete"
+ fd = open(f"./logs/{self.session}/delete_notebooks.log", 'a+')
+ print("Deleting: " + path, file=fd)
+ payload = {'path': path}
+ headers = {
+ 'Authorization': f'Bearer {token}',
+ 'Content-Type': 'application/json'
+ }
+ response = requests.post(api_url, headers=headers, json=payload)
+ print(response.text, file=fd)
+ fd.close()
+ return response
+
+ def delete_notebooks(self):
+ host = self.host
+ token = self.token
+ fd = open(f"./logs/{self.session}/delete_notebooks.log", 'a+')
+ print("*" * 80, file=fd)
+ print("NEW RUN LOGGED: " + str(datetime.datetime.now()), file=fd)
+ fd.close()
+ notebooks_list = self.read_log_file(f"./logs/{self.session}/user_workspace.log")
+ print("Total notebooks: " + str(len(notebooks_list)))
+ total = len(notebooks_list)
+ starting_Time = time.time()
+ for i, notebook in enumerate(notebooks_list):
+ time.sleep(self.retry_backoff)
+ current_time = time.time()
+ self.progress_bar(i, total, starting_Time, current_time)
+ response = self.delete_workspace_obj(json.loads(notebook).get("path"))
+
+    def get_url_token(self):
+        return self.host, self.token
+
+ def url_validation(self, url):
+ if '/?o=' in url:
+            # if the workspace_id exists, let's remove it from the URL
+            url = re.sub(r"/\?o=.*", '', url)
+ elif 'net/' == url[-4:]:
+ url = url[:-1]
+ elif 'com/' == url[-4:]:
+ url = url[:-1]
+ return url.rstrip("/")
+
+ def get_login_credentials(self, profile='DEFAULT'):
+ creds_path = '~/.databrickscfg'
+ config = configparser.ConfigParser()
+ abs_creds_path = os.path.expanduser(creds_path)
+ config.read(abs_creds_path)
+ try:
+ current_profile = dict(config[profile])
+ if not current_profile:
+ raise ValueError(f"Unable to find a defined profile to run this tool. Profile '{profile}' not found.")
+ return current_profile
+ except KeyError:
+ raise ValueError(
+ 'Unable to find credentials to load for profile. Profile only supports tokens.')
+
+
+class InputHandler(object):
+ def __init__(self):
+ pass
+
+ def get(self):
+ parser = argparse.ArgumentParser(description='Delete databricks Jobs')
+ parser.add_argument('-p', '--profile', dest='profile', required=True, help="Databricks Server URL")
+ parser.add_argument('-c', '--check-file', dest='check_file', required=False, help="Check for job name in file")
+ parser.add_argument('-s', '--session', dest='session', required=False, help="Session name")
+ parser.add_argument('-t', '--task', dest='task', required=False, help="Task to perform. One of 'delete_jobs', 'delete_notebooks', 'delete_clusters'", default='delete_jobs')
+ parser.add_argument('--retry-backoff', dest='retry_backoff', required=False, help="Retry backoff time", default=1.0)
+
+ parse_input = parser.parse_args()
+
+ if not parse_input.check_file and parse_input.task == 'delete_jobs':
+ print("Check file not provided or not found")
+ user_response = input("Do you want to continue without check file (y/n): ")
+ if user_response.lower() != 'y':
+ parser.print_help()
+ sys.exit(1)
+
+ return parse_input
+
+
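+# Example invocations (a sketch; the session and file names are hypothetical):
+#   python utils/databricks_delete.py -p DEFAULT -s session1 -t delete_clusters
+#   python utils/databricks_delete.py -p DEFAULT -s session1 -t delete_jobs -c jobs_check.csv
+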
+if __name__ == '__main__':
+ input_handler = InputHandler()
+ parse_input = input_handler.get()
+ dbObj = Databricks(profile=parse_input.profile, check_file=parse_input.check_file, session=parse_input.session, retry_backoff=parse_input.retry_backoff)
+ if parse_input.task == 'delete_jobs':
+ dbObj.delete_jobs()
+ elif parse_input.task == 'delete_notebooks':
+ dbObj.delete_notebooks()
+ elif parse_input.task == 'delete_clusters':
+ dbObj.delete_clusters()
\ No newline at end of file
diff --git a/utils/ff_view_tblprop.py b/utils/ff_view_tblprop.py
new file mode 100644
index 0000000..e53bd08
--- /dev/null
+++ b/utils/ff_view_tblprop.py
@@ -0,0 +1,36 @@
+import pandas as pd
+import numpy as np
+import re
+import os
+
+print("Directory to Loop Through: ")
+basePath = input("> ")
+
+folderName = os.path.basename(basePath)
+
+def fix_schema_errors(basePath: str):
+
+ tbl_pattern = r"TBLPROPERTIES \([^()]*\)"
+ # ddl_pattern = "\([^()]*\)"
+
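+    # Example of the clause the pattern above strips from a DDL (hypothetical property values):
+    #   TBLPROPERTIES ('delta.minReaderVersion' = '1', 'delta.minWriterVersion' = '2')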
+
+ print(f'Working on: {folderName} ...')
+ directory = os.fsencode(basePath)
+
+    for file in os.listdir(directory):
+        fileName = os.fsdecode(file)
+        filePath = os.path.join(basePath, fileName)
+        print(fileName)
+        try:
+            with open(filePath, "r") as f:
+                print(f"Opened file {fileName}")
+                ddl = f.read()
+                print(ddl)
+                if re.search(tbl_pattern, ddl):
+                    ddl = re.sub(tbl_pattern, '', ddl)
+                    with open(filePath, 'w') as f_out:
+                        f_out.write(ddl)
+
+ except AttributeError:
+ print('Failure')
+
+fix_schema_errors(basePath)
\ No newline at end of file
diff --git a/utils/force_fix_schema.py b/utils/force_fix_schema.py
new file mode 100644
index 0000000..62402c4
--- /dev/null
+++ b/utils/force_fix_schema.py
@@ -0,0 +1,53 @@
+import pandas as pd
+import numpy as np
+import re
+import os
+
+print("Directory to Loop Through: ")
+basePath = input("> ")
+print("Catalog to remove from DDL: ")
+namespace = input("> ")
+
+folderName = os.path.basename(basePath)
+
+def fix_schema_errors(basePath: str, namespace: str):
+ print('\n')
+ print(f'Applying schema mismatch fixes to {folderName} table.')
+ loc_pattern = "LOCATION '.*'"
+ tbl_pattern = "TBLPROPERTIES.*"
+ # ddl_pattern = "\([^()]*\)"
+
+
+ print(f'Working on: {folderName} ...')
+ directory = os.fsencode(basePath)
+
+    for file in os.listdir(directory):
+        fileName = os.fsdecode(file)
+        filePath = os.path.join(basePath, fileName)
+        print(fileName)
+        try:
+            with open(filePath, "r") as f:
+ print(f"Opened file {fileName}")
+ ddl = f.read()
+ print(ddl)
+ # x = re.search(loc_pattern, ddl)
+ print(f"Removing {namespace} from Create Statement")
+                ddl = re.sub(re.escape(namespace) + r'\.', '', ddl)
+ ddl = re.sub(r'\([^()]*\)', '', ddl)
+ ddl = re.sub(tbl_pattern, '', ddl)
+
+ # if x:
+ # print(f"Removing {namespace} from Create Statement")
+ # ddl = re.sub(f'{namespace}.', '', ddl)
+ # print('Removing schema definition from ddl')
+ # ddl = re.sub(r'\([^()]*\)', '', ddl)
+ # if re.search(tbl_pattern, ddl):
+ # ddl = re.sub(tbl_pattern, '', ddl)
+ # else:
+ # print(f'No Location in DDL in {fileName}, skipping...')
+                with open(filePath, 'w') as f_out:
+                    f_out.write(ddl)
+
+ except AttributeError:
+ print('Failure')
+
+fix_schema_errors(basePath, namespace)
\ No newline at end of file
diff --git a/utils/force_fix_schema_location_check.py b/utils/force_fix_schema_location_check.py
new file mode 100644
index 0000000..3a50cb8
--- /dev/null
+++ b/utils/force_fix_schema_location_check.py
@@ -0,0 +1,53 @@
+import pandas as pd
+import numpy as np
+import re
+import os
+
+print("Directory to Loop Through: ")
+basePath = input("> ")
+print("Catalog to remove from DDL: ")
+namespace = input("> ")
+
+folderName = os.path.basename(basePath)
+
+def fix_schema_errors(basePath: str, namespace: str):
+ print('\n')
+ print(f'Applying schema mismatch fixes to {folderName} table.')
+ loc_pattern = "LOCATION '.*'"
+ tbl_pattern = "TBLPROPERTIES.*"
+ # ddl_pattern = "\([^()]*\)"
+
+
+ print(f'Working on: {folderName} ...')
+ directory = os.fsencode(basePath)
+
+    for file in os.listdir(directory):
+        fileName = os.fsdecode(file)
+        filePath = os.path.join(basePath, fileName)
+        print(fileName)
+        try:
+            with open(filePath, "r") as f:
+ print(f"Opened file {fileName}")
+ ddl = f.read()
+ print(ddl)
+ x = re.search(loc_pattern, ddl)
+ print(f"Removing {namespace} from Create Statement")
+ # ddl = re.sub(f'{namespace}.', '', ddl)
+ # ddl = re.sub(r'\([^()]*\)', '', ddl)
+ # ddl = re.sub(tbl_pattern, '', ddl)
+
+ if x:
+ print(f"Removing {namespace} from Create Statement")
+                    ddl = re.sub(re.escape(namespace) + r'\.', '', ddl)
+ print('Removing schema definition from ddl')
+ ddl = re.sub(r'\([^()]*\)', '', ddl)
+ if re.search(tbl_pattern, ddl):
+ ddl = re.sub(tbl_pattern, '', ddl)
+ else:
+ print(f'No Location in DDL in {fileName}, skipping...')
+                with open(filePath, 'w') as f_out:
+                    f_out.write(ddl)
+
+ except AttributeError:
+ print('Failure')
+
+fix_schema_errors(basePath, namespace)
\ No newline at end of file
diff --git a/utils/jobs_dbr_modification.py b/utils/jobs_dbr_modification.py
new file mode 100644
index 0000000..7bd1d88
--- /dev/null
+++ b/utils/jobs_dbr_modification.py
@@ -0,0 +1,76 @@
+import json
+import sys
+
+def modify_json_file(input_file, output_file, new_dbr_job_ids, new_spark_version, default_spark_version):
+ try:
+ with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
+ # Read each line from the input file
+ for line in infile:
+ try:
+ # Parse the JSON string into a dictionary
+ data = json.loads(line)
+
+ # Modify the spark_version in the new_cluster
+ # if "settings" in data and "new_cluster" in data["settings"]:
+ # data["settings"]["new_cluster"]["spark_version"] = new_spark_version
+
+ job_id = data.get("job_id")
+
+
+ if job_id in new_dbr_job_ids:
+
+ spark_version_to_use = new_spark_version
+
+ else:
+
+ spark_version_to_use = default_spark_version
+
+
+ if "job_clusters" in data['settings']:
+ for i, job_cluster in enumerate(data['settings']["job_clusters"]):
+
+
+ data['settings']["job_clusters"][i]['new_cluster']['spark_version'] = spark_version_to_use
+
+
+ if "tasks" in data["settings"].keys():
+ # Multi-task
+ for i, task in enumerate(data["settings"]["tasks"]):
+
+
+ if "new_cluster" in task:
+
+ data["settings"]["tasks"][i]["new_cluster"]['spark_version'] = spark_version_to_use
+
+
+ else:
+ # Single-task
+
+ if "new_cluster" in data['settings'].keys():
+
+ data["settings"]["new_cluster"]['spark_version'] = spark_version_to_use
+
+
+ # Convert the modified dictionary back into a JSON string
+ modified_json_line = json.dumps(data)
+
+ # Write the modified JSON to the output file
+ outfile.write(modified_json_line + '\n')
+
+ except json.JSONDecodeError as e:
+ print(f"Error decoding JSON: {e}", file=sys.stderr)
+
+ except IOError as e:
+ print(f"Error opening or writing to file: {e}", file=sys.stderr)
+
+if __name__ == "__main__":
+ # Replace 'input.json' and 'output.json' with your actual file paths
+ input_file = './jobs_logs_testing/LL_jobs.log'
+ output_file = './jobs_logs_testing/LL_updated_jobs.log'
+
+ new_dbr_job_ids = [1009, 863]
+
+ # Modify the JSON file with the new spark version
+ modify_json_file(input_file, output_file, new_dbr_job_ids, new_spark_version="15.4.x-scala2.12", default_spark_version= "14.3.x-scala2.12")
+
+ print(f"Modified JSON written to {output_file}")
diff --git a/utils/rename_emails.py b/utils/rename_emails.py
new file mode 100644
index 0000000..2601d64
--- /dev/null
+++ b/utils/rename_emails.py
@@ -0,0 +1,146 @@
+import argparse
+import os
+import shutil
+
+def to_dict(csv_file):
+ """
+ summary: converts a csv or text file (or another comma delim file) into a
+ dictionary object
+
+ PARAMETERS:
+    csv_file: path of the comma delimited file; it is expected to have a header row
+        (including a 'userName' column), with the old address in the third column and
+        the new address in the seventh column of each row
+
+ RETURNS:
+ dict_from_csv: dictionary object where key is the old item and value
+ is new item
+ """
+ import csv
+
+ dict_from_csv = {}
+ with open(csv_file, mode='r') as f:
+ reader = csv.reader(f)
+ # assuming that each row is "old address, new address" for a user
+ dict_from_csv = {rows[2]:rows[6] for rows in reader}
+ del dict_from_csv['userName']
+ return dict_from_csv
+
+def map(file_name, mapping):
+ """
+ summary: reads parameter file_name and replaces all places where previous email
+ address is used with the new item as indicated in mapping
+
+ PARAMETERS:
+ file_name: path of the file that is to be read
+ mapping: dict where key is the previous item and value is the
+ new item
+
+ RETURNS:
+ data: a text object
+
+ """
+ with open(file_name, "r") as f:
+ data = f.read()
+ #print(f"Currently mapping {file_name}")
+ for e in mapping:
+ data = data.replace(e, mapping[e])
+ return data
+
+def write(file_name, data_write):
+ """
+ summary: writes parameter data_write to the path indicated by parameter
+ file_name
+
+ PARAMETERS:
+ file_name: path of the file that is to be written
+ data_write: text object
+
+ RETURNS:
+ n/a
+ """
+ with open(file_name, "w") as f:
+ f.write(data_write)
+
+def rename_users_folder(mapping):
+ """
+ summary: renames the user folder by moving all files to new directory
+
+ PARAMETERS:
+ mapping: dict where key is the previous item and value is the
+ new item
+
+ RETURNS:
+ n/a
+ """
+    os.makedirs("./artifacts/NewUsers", exist_ok=True)
+
+    users = os.listdir('./artifacts/Users')
+    for u in users:
+        if '.DS_Store' not in u:
+            if mapping.get(u, False):
+                shutil.move("./artifacts/Users/"+u, "./artifacts/NewUsers/"+mapping[u])
+            else:
+                shutil.move("./artifacts/Users/"+u, "./artifacts/NewUsers/"+u)
+
+ os.rename("./artifacts/Users", "./artifacts/EmptyDir") # this is an empty dir
+ os.rename("./artifacts/NewUsers", "./artifacts/Users")
+
+
+def mapping_file(file_name, mapping):
+ """
+    summary: maps a single file and writes the updated contents back in place
+
+ PARAMETERS:
+ file_name: path of the file to map
+ mapping: dict where key is the previous item and value is the
+ new item
+
+ RETURNS:
+ n/a
+ """
+    # replace every old address in the file and overwrite it in place
+ data = map(file_name, mapping)
+ write(file_name, data)
+
+def main():
+ all_args = argparse.ArgumentParser()
+ all_args.add_argument("--dir", "--file", dest="directory", required=True, help='directory needs to be updated via mapping.')
+ all_args.add_argument("-m", "--mapping", dest="mapping", required=True, help='one-to-one mapping provided by a comma delim file')
+
+ args = all_args.parse_args()
+    file_name = args.directory
+ mapping_file_ = args.mapping
+
+ mapping = to_dict(mapping_file_)
+ print("Mapping: ")
+ print(mapping)
+ print("--------------------")
+ yesno = input("Confirm mapping (y/n): ")
+ if yesno.lower() != "y":
+ exit()
+
+    # change the current working directory to the specified path
+ os.chdir(file_name)
+ # verify the path using getcwd()
+ cwd = os.getcwd()
+ print("Current working directory is:", cwd)
+
+ logs = os.listdir()
+
+ for file in logs:
+ # making sure we are only getting the logs
+ if ".log" in file:
+ mapping_file(file, mapping)
+ if "groups" == file:
+ groups = os.listdir("groups")
+ for g in groups:
+ mapping_file("/groups/"+g, mapping)
+
+
+ rename_users_folder(mapping)
+
+if __name__ == "__main__":
+ main()
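+
+# Example invocation (a sketch; paths are hypothetical). The mapping csv is expected to have a
+# header row, with the old address in its third column and the new address in its seventh:
+#   python utils/rename_emails.py --dir ./logs/session1 -m user_mapping.csv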
diff --git a/utils/search_and_replace.py b/utils/search_and_replace.py
new file mode 100644
index 0000000..4979921
--- /dev/null
+++ b/utils/search_and_replace.py
@@ -0,0 +1,23 @@
+import os
+
+print("Directory to Loop Through: ")
+basePath = input("> ")
+
+print("Text to find: ")
+texToFind = input("> ")
+
+print('Text to use as replacement: ')
+replacementText = input('> ')
+
+directory = os.fsencode(basePath)
+
+for file in os.listdir(directory):
+    fileName = os.fsdecode(file)
+    filePath = os.path.join(basePath, fileName)
+    with open(filePath, 'r') as f_in:
+        filedata = f_in.read()
+
+    filedata = filedata.replace(texToFind, replacementText)
+
+    print(f'Replacing occurrences of {texToFind} with {replacementText} in file {fileName}')
+    with open(filePath, 'w') as f_out:
+        f_out.write(filedata)
\ No newline at end of file
diff --git a/utils/split_logs.py b/utils/split_logs.py
new file mode 100644
index 0000000..5c5f115
--- /dev/null
+++ b/utils/split_logs.py
@@ -0,0 +1,596 @@
+import json
+import os
+import shutil
+import pandas as pd
+import gzip
+from datetime import datetime
+
+class Split():
+ def __init__(self, checkpoint, workspace, default_owner=False):
+ self.path = "./logs/"+checkpoint+"/"
+ self.workspace = workspace
+ self.new_path = "./logs/"+checkpoint+"_"+workspace+"/"
+ self.imported_users = []
+ self.imported_groups = ['admins', 'Users']
+ self.default_job_owner = default_owner
+
+ def read_log(self, file_name):
+ """
+ summary: reads a given log
+ """
+ try:
+ with open(self.path+file_name) as f:
+ data = f.read().split("\n")
+ return data
+        except FileNotFoundError as e:
+            print(f"{datetime.now()} Error: {file_name} not found. ")
+            return ''
+ except Exception as e:
+ print(f"{datetime.now()} Error: There was an unknown error reading {file_name}. ")
+ #print(e)
+ return ''
+
+ def write_logs(self, log, file_name):
+ """
+ summary: function to write a dict to a 'json' log in the same way that
+ the original logs are written
+ """
+ file_path = self.new_path+file_name
+
+ with open(file_path, 'w') as f:
+ for l in log:
+ f.write(json.dumps(l) + '\n')
+
+ def fix_acls(self, acls, jobs=False):
+ new_acls = []
+ for permission in acls:
+ if 'group_name' in permission.keys():
+ if permission['group_name'] in self.imported_groups:
+ new_acls.append(permission)
+ if 'user_name' in permission.keys():
+ if permission['user_name'] in self.imported_users:
+ new_acls.append(permission)
+ else:
+ # user will get dropped
+ if jobs:
+ if permission['all_permissions'][0]['permission_level'] == 'IS_OWNER':
+ if self.default_job_owner:
+ default_permission = {"user_name": self.default_job_owner, "all_permissions": [{"permission_level": "IS_OWNER", "inherited": False}]}
+ new_acls.append(default_permission)
+ else:
+ return 0
+ if 'principal' in permission.keys():
+ if permission['principal'] in self.imported_users:
+ new_acls.append(permission)
+ if 'userName' in permission.keys():
+ if permission['userName'] in self.imported_users:
+ new_acls.append(permission)
+ if 'display' in permission.keys():
+ if permission['display'] in self.imported_groups:
+ new_acls.append(permission)
+ return new_acls
+
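+    # Example of an ACL entry fix_acls keeps or drops (a sketch; the address is hypothetical):
+    #   {"user_name": "kept.user@example.com",
+    #    "all_permissions": [{"permission_level": "CAN_MANAGE", "inherited": False}]}
+    # The entry is kept only if the address appears in imported_users, which users() fills below.
+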
+ def users(self, df, file_name="users.log"):
+ self.imported_users = []
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if d['emails'][0]['value'] in df['userName'].tolist():
+ data_write.append(d)
+ self.imported_users.append(d['emails'][0]['value'])
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+
+ def instance_pools(self, df, file_name="instance_pools.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if d['instance_pool_id'] in df['instance_pool_id'].tolist():
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+
+ def secret_scopes(self, df, file_name=None):
+ scopes = df["secret_scopes"]
+ errors = {'Data':[], 'Error':[]}
+ for scope in scopes:
+ try:
+ if "secret_scopes" not in os.listdir(self.new_path):
+ os.mkdir(self.new_path+"secret_scopes")
+ new_file_path = self.new_path+"secret_scopes/"+scope
+ src_path = self.path+"secret_scopes/"+scope
+ shutil.copyfile(src_path,new_file_path)
+ except Exception as e:
+ errors['Data'].append(scope)
+ errors['Error'].append(e)
+ return errors
+
+ def secret_scopes_acls(self, df, file_name="secret_scopes_acls.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if d['scope_name'] in df['secret_scopes'].tolist():
+ d['items'] = self.fix_acls(d['items'])
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def clusters(self, df, file_name = "clusters.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if d['cluster_name'] in df['cluster_name'].tolist():
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def cluster_policy(self, df, file_name = "cluster_policies.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if d['policy_id'] in df['policy_id'].tolist():
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def acl_clusters(self, df, file_name = "acl_clusters.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ cluster = d['object_id'].split("/")[-1]
+ if cluster in df['cluster_id'].tolist():
+ if "access_control_list" in d.keys():
+ d['access_control_list'] = self.fix_acls(d['access_control_list'])
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def acl_cluster_policies(self, df, file_name = "acl_cluster_policies.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ policy = d['object_id'].split("/")[-1]
+ if policy in df['policy_id'].tolist():
+ data_write.append(d)
+ if "access_control_list" in d.keys():
+ d['access_control_list'] = self.fix_acls(d['access_control_list'])
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def jobs(self, df, file_name="jobs.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if d['job_id'] in df['job_ids'].tolist():
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def acl_jobs(self, df, file_name="acl_jobs.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ jobid = d['object_id'].split("/")[-1]
+ if int(jobid) in df['job_ids'].tolist():
+ # print(f"{datetime.now()} - Editing Job with Job ID: {jobid}")
+ if "access_control_list" in d.keys():
+ d['access_control_list'] = self.fix_acls(d['access_control_list'], jobs=True)
+ if d['access_control_list'] == 0:
+ errors['Data'].append(jobid)
+ errors['Error'].append("Job Owner is not tagged in the asset mapping.")
+ continue
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def instance_profiles(self, df, file_name="instance_profiles.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if d['instance_profile_arn'] in df['instance_profile_arn'].tolist():
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def mounts(self, df, file_name='mounts.log'):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if d['path'] in df['mount_paths'].tolist():
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def groups(self, df, file_name=None):
+ groups = df['group_name']
+ errors = {'Data':[], 'Error':[]}
+ self.imported_groups = groups.tolist()
+
+ for group in groups:
+ try:
+ if "groups" not in os.listdir(self.new_path):
+ os.mkdir(self.new_path + "groups/")
+
+ group_data = self.read_log("groups/" + group)
+ group_data_write = []
+ for d in group_data:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if "members" in d.keys():
+ d['members'] = self.fix_acls(d['members'])
+ group_data_write.append(d)
+ self.write_logs(group_data_write, "groups/" + group)
+ except Exception as e:
+ errors['Data'].append(group)
+ errors['Error'].append(e)
+ return errors
+
+ def user_dirs(self, df=None, file_name="user_dirs.log"):
+ data_user = df
+ user_names = data_user['userName'].tolist()
+ try:
+            data_art = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs")
+ art_names = data_art['global_folder_names'].tolist()
+ except:
+ data_art = []
+ art_names = []
+ try:
+ data_shared = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_shared_logs")
+ shared_names = data_shared['notebook_names'].tolist()
+ except:
+ data_shared = []
+ shared_names = []
+
+ data = self.read_log(file_name)
+ user_paths=['/Users/'+ n for n in user_names]
+ shared_paths=['/Shared/'+ n for n in shared_names]
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+
+ for d in data:
+ if d != '':
+ try:
+ d = json.loads(d)
+ path = str(d['path'])
+ if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))):
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def user_workspace(self, df, file_name="user_workspace.log"):
+ data_user = df
+ user_names = data_user['userName'].tolist()
+
+ try:
+ data_art = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs")
+ art_names = data_art['global_folder_names'].tolist()
+ except:
+ data_art = []
+ art_names = []
+ try:
+ data_shared = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_shared_logs")
+ shared_names = data_shared['notebook_names'].tolist()
+ except:
+ data_shared = []
+ shared_names = []
+ data = self.read_log(file_name)
+ user_paths=['/Users/'+ n for n in user_names]
+ shared_paths=['/Shared/'+ n for n in shared_names]
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ if d != '':
+ try:
+ d = json.loads(d)
+ path = str(d['path'])
+ if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))):
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def shared_notebooks(self, df, file_name=None):
+ names = df['notebook_names']
+ errors = {'Data':[], 'Error':[]}
+ for notebook in names:
+ try:
+ if "artifacts" not in os.listdir(self.new_path):
+ os.mkdir(self.new_path+'artifacts')
+ if "Shared" not in os.listdir(self.new_path+"artifacts/"):
+ os.mkdir(self.new_path+'artifacts/Shared/')
+ new_folder_path = self.new_path+'artifacts/Shared/'+notebook
+ src_path = self.path+'artifacts/Shared/'+notebook
+ shutil.copytree(src_path,new_folder_path)
+ except Exception as e:
+ errors['Data'].append(notebook)
+ errors['Error'].append(e)
+ return errors
+
+ def global_notebooks(self, df, file_name=None):
+ names = df['global_folder_names']
+ errors = {'Data':[], 'Error':[]}
+ for notebook in names:
+ try:
+ if "artifacts" not in os.listdir(self.new_path):
+ os.mkdir(self.new_path+'artifacts')
+ new_folder_path = self.new_path+'artifacts/'+notebook
+ src_path = self.path+'artifacts/'+notebook
+ shutil.copytree(src_path,new_folder_path)
+ except Exception as e:
+ errors['Data'].append(notebook)
+ errors['Error'].append(e)
+ return errors
+
+ def user_notebooks(self, df, file_name=None):
+ errors = {'Data':[], 'Error':[]}
+ for u in self.imported_users:
+ try:
+ if "artifacts" not in os.listdir(self.new_path):
+ os.mkdir(self.new_path+'artifacts')
+ if "Users" not in os.listdir(self.new_path + "artifacts/"):
+ os.mkdir(self.new_path+'artifacts/Users/')
+
+ new_folder_path = self.new_path+'artifacts/Users/'+u
+ src_path = self.path+'artifacts/Users/'+u
+ shutil.copytree(src_path,new_folder_path)
+ except Exception as e:
+ errors['Data'].append(u)
+ errors['Error'].append(e)
+ return errors
+
+ def acl_notebooks(self, df, file_name="acl_notebooks.log"):
+ data_user = df
+ user_names = data_user['userName'].tolist()
+ try:
+ data_art = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs")
+ art_names = data_art['global_folder_names'].tolist()
+ except Exception as e:
+ print(e)
+ data_art = []
+ art_names = []
+ try:
+ data_shared = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_shared_logs")
+ shared_names = data_shared['notebook_names'].tolist()
+ except:
+ data_shared = []
+ shared_names = []
+
+ data = self.read_log(file_name)
+ user_paths=['/Users/'+ n for n in user_names]
+ shared_paths=['/Shared/'+ n for n in shared_names]
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+ for d in data:
+ if d != '':
+ try:
+ d = json.loads(d)
+ path = str(d['path'])
+ if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))):
+ if "access_control_list" in d.keys():
+ d['access_control_list'] = self.fix_acls(d['access_control_list'])
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def acl_directories(self, df, file_name="acl_directories.log"):
+ data_user = df
+ user_names = data_user['userName'].tolist()
+ try:
+ data_art = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_logs")
+ art_names = data_art['global_folder_names'].tolist()
+ except:
+ data_art = []
+ art_names = []
+ try:
+ data_shared = pd.read_excel("asset_mapping.xlsx", sheet_name = "global_shared_logs")
+ shared_names = data_shared['notebook_names'].tolist()
+ except:
+ data_shared = []
+ shared_names = []
+
+ data = self.read_log(file_name)
+ user_paths=['/Users/'+ n for n in user_names]
+ shared_paths=['/Shared/'+ n for n in shared_names]
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+
+ for d in data:
+ if d != '':
+ try:
+ d = json.loads(d)
+ path = str(d['path'])
+ if (path[1:].startswith(tuple(art_names)) or path.startswith(tuple(user_paths)) or path.startswith(tuple(shared_paths))):
+ if "access_control_list" in d.keys():
+ d['access_control_list'] = self.fix_acls(d['access_control_list'])
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+ return errors
+
+ def metastore(self, df, file_name=None, split_tables=False):
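+        # copy each mapped database (or, with split_tables, each individual table file)
+        # from the source checkpoint's metastore folder into the new workspace's folder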
+ databases = os.listdir(self.path + "metastore/")
+ errors = {'Data':[], 'Error':[]}
+ for dbtb in df['both'].tolist():
+ try:
+ db = dbtb.split(".")[0]
+ if "metastore" not in os.listdir(self.new_path):
+ os.mkdir(self.new_path+"metastore/")
+ new_folder_path = self.new_path+"metastore/"+db
+ src_path = self.path+"metastore/"+db
+ if split_tables:
+ tb = dbtb.split(".")[1]
+ new_file_path = new_folder_path + "/" + tb
+ src_file_path = src_path + "/" + tb
+                    # make sure the destination database folder exists before copying a single table file
+                    if db not in os.listdir(self.new_path+"metastore/"):
+                        os.mkdir(new_folder_path)
+                    if tb not in os.listdir(new_folder_path):
+                        shutil.copyfile(src_file_path, new_file_path)
+ else:
+ if db not in os.listdir(self.new_path+"metastore/"):
+ shutil.copytree(src_path, new_folder_path)
+ except Exception as e:
+                errors['Data'].append(dbtb)
+ errors['Error'].append(e)
+ return errors
+
+ def success_metastore(self, df, file_name='success_metastore.log'):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ database = d['table'].split(".")[0]
+ if database in df['databases'].tolist():
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+        return errors
+
+ def database_details(self, df, file_name="database_details.log"):
+ data = self.read_log(file_name)
+ data_write = []
+ errors = {'Data':[], 'Error':[]}
+
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ database = d['Namespace Name']
+ if database in df['databases'].tolist():
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ self.write_logs(data_write, file_name)
+        return errors
+
+ def table_acls(self, df, file_name="logs/table_acls/00_table_acls.json.gz"):
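+        # decompress the exported table acls, keep only the entries for databases mapped
+        # to this workspace, and write the filtered list into the new workspace's folder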
+ errors = {'Data':[], 'Error':[]}
+ with gzip.open(file_name, 'rb') as f_in:
+ with open(self.path+"table_acls/00_table_acls.json", "wb") as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ data = self.read_log('table_acls/00_table_acls.json')
+ data_write = []
+ for d in data:
+ try:
+ if len(d) != 0:
+ d = d.strip()
+ d = json.loads(d)
+ if len(df.loc[(df['databases'] == d['Database'])]) > 0:
+ data_write.append(d)
+ except Exception as e:
+ errors['Data'].append(d)
+ errors['Error'].append(e)
+ if "table_acls" not in os.listdir(self.new_path):
+ os.mkdir(self.new_path+"table_acls")
+ file_path = self.new_path+"table_acls/00_table_acls.json"
+ with open(file_path, 'w') as f:
+ json.dump(data_write, f)
+ return errors
diff --git a/utils/to_csv.py b/utils/to_csv.py
new file mode 100644
index 0000000..647d760
--- /dev/null
+++ b/utils/to_csv.py
@@ -0,0 +1,286 @@
+import pandas as pd
+import json
+import argparse
+import os
+
+def read_log(file_name, checkpoint):
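+    # returns the log split on newlines (without the trailing empty entry);
+    # 1 signals a missing log file, '' signals an unexpected read error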
+ try:
+ with open ("logs/" + checkpoint + "/" + file_name) as f:
+ data = f.read().split("\n")
+ return data[:-1]
+
+ except FileNotFoundError as e:
+ return 1
+ except Exception as e:
+ print(f"Error while reading {file_name}...")
+ return ''
+
+def save_to_csv(data, file_name, destination):
+ try:
+ pd.DataFrame.from_dict(data).to_csv(f"./{destination}/{file_name}")
+    except Exception as e:
+        print(f"Error while writing {file_name}: {e}")
+
+
+def create_instance_profiles(data):
+ instance_profile_arn = []
+ for d in data:
+ try:
+ d = json.loads(d)
+ instance_profile_arn.append(d['instance_profile_arn'])
+ except Exception as e:
+ pass
+ return {'instance_profile_arn': instance_profile_arn}
+
+def create_instance_pools(data):
+ instance_pool_name = []
+ instance_pool_id = []
+
+ for d in data:
+ try:
+ d = json.loads(d)
+ instance_pool_name.append(d['instance_pool_name'])
+ instance_pool_id.append(d['instance_pool_id'])
+ except Exception as e:
+ pass
+
+ return {'instance_pool_name': instance_pool_name, 'instance_pool_id': instance_pool_id}
+
+def create_users(data):
+ userName = []
+ displayName = []
+
+ for d in data:
+ try:
+ d = json.loads(d)
+ if "userName" in d:
+ userName.append(d['userName'])
+ else:
+ userName.append(" ")
+ if "displayName" in d:
+ displayName.append(d['displayName'])
+ else:
+ displayName.append(" ")
+
+ except Exception as e:
+ pass
+
+ return {'userName': userName, 'displayName': displayName}
+
+def read_group(group_path):
+ try:
+ with open(group_path) as f:
+ data = f.read().split("\n")
+ return data
+ except FileNotFoundError as e:
+ return 1
+ except Exception as e:
+ print(f"Error while reading group at path {group_path}: {e}")
+ return 2
+
+def create_groups(directory_name = "groups", checkpoint = ""):
+ if directory_name not in os.listdir(f"./logs/{checkpoint}/"):
+ return 1
+
+ groups_path = f"./logs/{checkpoint}/{directory_name}/"
+ groups_dir = os.listdir(groups_path)
+ groups = {}
+
+ for g in groups_dir:
+ group_roles = []
+ group_members = []
+ group_users = []
+
+ data = read_group(groups_path + g)
+ if data == 1: # group not found
+ print(f"Group {g} not found in the checkpoint. Skipping...")
+ continue # to next group
+ if data == 2: # unknown error
+ continue
+ data = data[0]
+ d = json.loads(data)
+ group_name = d['displayName']
+
+ if 'roles' in d.keys():
+ roles = d['roles']
+ for role in roles:
+ group_roles.append(role['value'])
+
+ if 'members' in d.keys():
+ members = d['members']
+ for member in members:
+ group_members.append(member.get('display', 'display not found'))
+ group_users.append(member.get('userName', 'userName not found'))
+
+
+ groups[group_name] = [group_roles, group_members, group_users]
+ results = {}
+ total_names = []
+ total_group_roles = []
+ total_group_members = []
+ total_group_users = []
+
+ for k,v in groups.items():
+ total_names.append(k)
+ total_group_roles.append(v[0])
+ total_group_members.append(v[1])
+ total_group_users.append(v[2])
+ return {'group_name': total_names, 'group_roles': total_group_roles, 'group_members': total_group_members, 'group_users': total_group_users }
+
+
+def create_clusters(data):
+ cluster_id = []
+ cluster_name = []
+ creator_user_name = []
+ policy_id = []
+ instance_profile = []
+
+ for d in data:
+ try:
+ d = json.loads(d)
+ cluster_id.append(d['cluster_id'])
+ cluster_name.append(d['cluster_name'])
+ creator_user_name.append(d['creator_user_name'])
+ if "policy_id" in d.keys():
+ policy_id.append(d['policy_id'])
+ else:
+ policy_id.append(" ")
+ try:
+ instance_profile.append(d['aws_attributes']['instance_profile_arn'])
+ except:
+ instance_profile.append('')
+ except Exception as e:
+ print("Error in creating clusters...")
+
+ return {'cluster_id': cluster_id, 'cluster_name': cluster_name, 'creator_user_name': creator_user_name, 'policy_id': policy_id, 'instance_profile': instance_profile}
+
+def create_cluster_policies(data):
+ policy_id = []
+ policy_name = []
+
+ for d in data:
+ try:
+ d = json.loads(d)
+ policy_id.append(d['policy_id'])
+ policy_name.append(d['name'])
+ except Exception as e:
+ print("Error in creating cluster policies...")
+
+ return {'policy_id': policy_id, 'policy_name': policy_name}
+
+
+def create_jobs(data, jobs_acls):
+ job_ids = []
+ job_names = []
+ job_types = []
+ job_creators = []
+ job_owners = []
+ instance_profile = []
+
+ for d in data:
+ try:
+ d = json.loads(d)
+ job_ids.append(d['job_id'])
+ jn = d['settings']['name']
+            # job names are expected to carry a '::' suffix; keep only the part before it
+            job_names.append(jn.split('::')[0])
+ try:
+ job_types.append(d['settings']['format'])
+ except:
+ job_types.append('N/A')
+ try:
+ job_creators.append(d['creator_user_name'])
+ except:
+ job_creators.append('N/A')
+ try:
+ instance_profile.append(d['settings']['new_cluster']['aws_attributes']['instance_profile_arn'])
+ except:
+ instance_profile.append('')
+ except Exception as e:
+ print("Error in creating jobs...")
+
+    if jobs_acls != 1: # if the job acl log was found in the session
+        for a in jobs_acls:
+            try:
+                a = json.loads(a)
+                for j in a['access_control_list']:
+                    if j.get('user_name') is not None:
+                        if j['all_permissions'][0]['permission_level'] == 'IS_OWNER':
+                            job_owners.append(j['user_name'])
+            except Exception:
+                job_owners.append('')
+
+    # pad job_owners so every column has the same length when the dict is turned into a DataFrame
+    job_owners += [''] * (len(job_ids) - len(job_owners))
+
+    return {'job_ids': job_ids, 'job_names': job_names, 'job_type': job_types, 'job_creator': job_creators, 'job_owner': job_owners, 'instance_profile': instance_profile}
+
+
+def create_shared_logs(directory_name = "artifacts/Shared", checkpoint = ""):
+ if directory_name not in os.listdir(f"./logs/{checkpoint}/"):
+ return 1
+ shared_path = f"./logs/{checkpoint}/{directory_name}"
+ notebooks = os.listdir(shared_path)
+ return {"notebook_names" : notebooks}
+
+def create_other_artifacts(directory_name = "artifacts", checkpoint = ""):
+ if directory_name not in os.listdir(f"./logs/{checkpoint}/"):
+ return 1
+ other_path = f"./logs/{checkpoint}/{directory_name}"
+ notebooks = os.listdir(other_path)
+ if "Users" in notebooks:
+ notebooks.remove("Users")
+ if "Shared" in notebooks:
+ notebooks.remove("Shared")
+ return {"global_folder_names" : notebooks}
+
+def create_libraries(data):
+ library_paths = []
+ library_names = []
+ for d in data:
+ if len(d) > 0:
+ d = json.loads(d)
+ library_paths.append(d['path'])
+ library_names.append(d['path'].split("/")[-1])
+ return {'library_paths': library_paths, 'library_names': library_names}
+
+def create_scopes(directory_name = "secret_scopes", checkpoint = ""):
+ if directory_name not in os.listdir(f"./logs/{checkpoint}/"):
+ return 1
+ secrets = os.listdir(f"./logs/{checkpoint}/{directory_name}/")
+ return {"secret_scopes" : secrets}
+
+def create_mounts(data):
+ mount_paths = []
+ mount_sources = []
+ for d in data:
+ try:
+ d = json.loads(d)
+ mount_paths.append(d['path'])
+ mount_sources.append(d['source'])
+ except Exception as e:
+ print("Error in mounts...")
+ return { 'mount_paths' : mount_paths, 'mount_sources' : mount_sources }
+
+
+def create_database(checkpoint = "", directory_name = 'metastore'):
+ if directory_name not in os.listdir(f"./logs/{checkpoint}/"):
+ return 1
+ metastore_path = f"./logs/{checkpoint}/{directory_name}"
+ return {'databases': [i for i in os.listdir(metastore_path) if i != ".DS_Store"]}
+
+
+def create_metastore(checkpoint = "", directory_name = 'metastore'):
+ if directory_name not in os.listdir(f"./logs/{checkpoint}/"):
+ return 1
+
+ metastore_path = f"./logs/{checkpoint}/{directory_name}"
+ try:
+ metastore_database = [i for i in os.listdir(metastore_path) if i != ".DS_Store"]
+ except:
+ print("metastore directory not found in checkpoint session. Skipping...")
+ return
+ tables = []
+ for db in metastore_database:
+ db_path = metastore_path + '/' + db
+ metastore_tables = [(db, tb, db+"."+tb) for tb in os.listdir(db_path)]
+ tables.extend(metastore_tables)
+
+ dbs, tbs, both = zip(*tables)
+ return {'databases' : dbs, "tables": tbs, "name": both}
diff --git a/workspace_mapping_instructions.md b/workspace_mapping_instructions.md
new file mode 100644
index 0000000..8be96a2
--- /dev/null
+++ b/workspace_mapping_instructions.md
@@ -0,0 +1,116 @@
+### Virtual Environment
+Use the requirements file to install all of the required packages. For example, with Conda you can install the packages while creating the environment (substitute your own environment name for the placeholder):
+
+```
+conda create --name <env-name> --file requirements.txt
+```
+
+# Workspace Mapping
+## Logs to CSV
+
+Run the **convert_all_logs.py** file. This will produce a directory _/csv_ with all of the necessary csvs, plus an Excel document _asset_mapping.xlsx_ that contains all of the csvs as spreadsheets. These csvs will be used to manually allocate resources to each workspace.
+
+```
+python convert_all_logs.py
+```
+
+Please keep all of the files directly in the migrate folder. Do not put them in the logs directory or in a specific checkpoint. The migrate folder should look like this:
+
+```bash
+├── logs
+│ ├── clusters.log
+│ ├── groups
+│ │ ├── ...
+│ ├── instance_pools.log
+│ ├── instance_profiles.log
+│ ├── jobs.log
+│ ├── libraries.log
+│ ├── secret_scopes
+│ │ ├── ...
+│ ├── users.log
+├── convert_all_logs.py
+└── utils
+```
+
+After running the scripts, you should see a _csv_ directory with the csvs.
+
+```bash
+├── csv
+│ ├── users.csv
+│ ├── global_shared_logs.csv
+│ ├── instance_pools.csv
+│ ├── libraries.csv
+│ ├── jobs.csv
+│ ├── secret_scopes.csv
+│ ├── clusters.csv
+│ ├── instance_profiles.csv
+│ ├── mounts.csv
+│ ├── metastore.csv
+│ ├── groups.csv
+│ ├── shared_logs.csv
+```
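+
+As a quick sanity check, you can load one of the generated csvs and list the sheets in the combined workbook. This is a minimal sketch; it assumes you run it from the migrate folder and that pandas (with Excel support) from the requirements file is installed:
+
+```python
+import pandas as pd
+
+# spot-check one of the generated csvs
+users = pd.read_csv("csv/users.csv")
+print(users.head())
+
+# list the sheets collected into the combined Excel workbook
+workbook = pd.ExcelFile("asset_mapping.xlsx")
+print(workbook.sheet_names)
+```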
+## Manual Resource Mapping
+
+Working directly in the csvs, decide where each resource should be moved: for every row, add the target workspace in a new column titled **workspace**.
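+
+If you would rather assign workspaces programmatically than edit the spreadsheets by hand, the sketch below shows the idea for clusters.csv; the allocation rule, the workspace names, and the user email are placeholders you would replace with your own logic:
+
+```python
+import pandas as pd
+
+clusters = pd.read_csv("csv/clusters.csv")
+
+# placeholder rule: route clusters created by one user to workspace1, everything else to workspace2
+clusters["workspace"] = clusters["creator_user_name"].apply(
+    lambda creator: "workspace1" if creator == "some.user@example.com" else "workspace2"
+)
+
+clusters.to_csv("csv/clusters.csv", index=False)
+```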
+
+## Mapping
+
+Run the map.py file. Note the session name (the name of the directory under the logs directory that contains the logs) and pass it with the _checkpoint_ parameter. List all of the workspaces, separated by spaces, with the _workspace_ parameter. The script reads the csvs and splits the logs for each workspace into its own directory.
+
+```
+python map.py --checkpoint [SESSION NAME] --workspace [WORKSPACE1 WORKSPACE2 ..]
+```
+
+This assumes that the /logs folder is located in the same directory as map.py. Please do not change the column headings in the csvs, as these headings are referenced by the mapping scripts.
+
+This is what the directory should look like:
+
+```bash
+├── csv
+│ ├── users.csv
+│ ├── global_shared_logs.csv
+│ ├── instance_pools.csv
+│ ├── libraries.csv
+│ ├── jobs.csv
+│ ├── secret_scopes.csv
+│ ├── clusters.csv
+│ ├── instance_profiles.csv
+│ ├── mounts.csv
+│ ├── metastore.csv
+│ ├── groups.csv
+│ ├── shared_logs.csv
+├── logs
+│ ├── [session name]
+│ │ ├── users.log
+│ │ ├── clusters.log
+│ │ ├── user_dirs.log
+│ │ ├── ...
+├── map.py
+├── utils
+│ ├── create_workspace.py
+│ ├── split_logs.py
+
+```
+
+After running the map.py file, your directory should look like this. Each workspace has its own unique session name (the original session name concatenated with the workspace name), which lets you import the logs directly using that unique session name.
+
+```bash
+├── csv
+│ ├── ...
+├── map.py
+├── utils
+├── logs
+│ ├── [session name]_workspace1
+│ │ ├── users.log
+│ │ ├── clusters.log
+│ │ ├── user_dirs.log
+│ │ ├── ...
+│ ├── [session name]_workspace2
+│ │ ├── users.log
+│ │ ├── clusters.log
+│ │ ├── user_dirs.log
+│ │ ├── ...
+│ ├── ...
+└── ...
+
+```