156 commits
722b35d
workspace mapping scripts
Dec 13, 2022
5b3df0e
updating workspace mapping scripts
Dec 19, 2022
416ddda
Merge branch 'databrickslabs:master' into master
veenaramesh Dec 19, 2022
8b06df9
delete unnecessary files
veenaramesh Dec 19, 2022
d0cdf0a
adding util notebooks
Dec 19, 2022
a72f2b0
consistency edits
Dec 21, 2022
fb446f4
Merge branch 'databrickslabs:master' into master
veenaramesh Jan 4, 2023
b6392b6
Merge branch 'databrickslabs:master' into master
veenaramesh Jan 6, 2023
31b8d91
Merge branch 'databrickslabs:master' into master
veenaramesh Jan 12, 2023
e6a5d87
Merge branch 'databrickslabs:master' into master
veenaramesh Jan 23, 2023
23cf07f
adds notification if Job Owner is dropped.
veenaramesh Jan 23, 2023
278d13e
Merge branch 'databrickslabs:master' into master
veenaramesh Feb 16, 2023
8a89b14
fixes for secret scopes, shared folders, and global logs
Mar 9, 2023
9e88271
adding functionality to also split database_details.log
Mar 10, 2023
d26f0d1
Merge branch 'master' of https://github.com/Lovelytics/migrate
Mar 10, 2023
7ce09f9
Merge branch 'databrickslabs:master' into master
veenaramesh Mar 16, 2023
05f7c95
fix to secret scope acl split
veenaramesh Mar 17, 2023
6ce0947
fixing groups split issue
Mar 23, 2023
b96ecb0
Merge branch 'master' of https://github.com/Lovelytics/migrate
Mar 23, 2023
78ed463
adjusting file names
Mar 23, 2023
27deb67
fixing ACLs issue for groups
veenaramesh Mar 24, 2023
bfc70ff
fixing Shared + artifacts ACLs split
Mar 27, 2023
9273d37
merging split_logs
Mar 27, 2023
d646e16
adding + fixing error logging capabilities
Mar 28, 2023
5181fce
rephrasing outputs. grammatical issues
Mar 28, 2023
caf4419
adding default job owner capability
Mar 28, 2023
1322074
fixing dir acl issue
Mar 28, 2023
b462bfc
revert last commit
veenaramesh Mar 28, 2023
1c90ee3
fixing syntax error
veenaramesh Mar 28, 2023
cecf55e
adding library migration script; content created by Tejas
Mar 30, 2023
32c3a6f
adding logging fun stuff
Mar 30, 2023
f550f70
Update default cluster name to E2_Migration
veenaramesh Apr 6, 2023
0d0c95d
Update default cluster name, remove spark configs
veenaramesh Apr 6, 2023
b16961f
fixing parser.py to allow export_db.py to be used
Apr 6, 2023
da0ea62
fixing parser.py to allow export_db.py to be used
Apr 6, 2023
5a02413
Merge branch 'databrickslabs:master' into master
veenaramesh Apr 6, 2023
a7ddcf0
Merge branch 'databrickslabs:master' into master
veenaramesh Apr 11, 2023
42f3ac7
Merge branch 'databrickslabs:master' into master
veenaramesh Apr 12, 2023
6aec643
new default job owner parameter to set default owner when owners are m…
Apr 17, 2023
d19c3b9
will update secret_scopes_acls for updating emails
veenaramesh Apr 24, 2023
f3ab6ca
Merge branch 'databrickslabs:master' into master
veenaramesh Apr 24, 2023
2a4357e
Merge branch 'databrickslabs:master' into master
veenaramesh May 4, 2023
26e7614
Update JobsClient.py
veenaramesh May 4, 2023
6243948
Update dbclient.py
veenaramesh May 16, 2023
6136a24
Merge branch 'databrickslabs:master' into master
veenaramesh Jun 1, 2023
7a68139
Asset mapping spreadsheet updates (#8)
SarahCree Jun 9, 2023
3b06a48
fixing merge conflict with sync
Jun 9, 2023
bde2351
Merge branch 'databrickslabs-master'
Jun 9, 2023
099a4a6
adding parameter tag
Jul 10, 2023
2a9604d
adding destination parameter
Jul 10, 2023
026bc7d
fixing grammar issue
Jul 10, 2023
dee67cb
adding cluster policies to sheet
Jul 10, 2023
8ca0f6a
Merge branch 'databrickslabs:master' into master
veenaramesh Jul 20, 2023
5cd32f7
Merge branch 'databrickslabs:master' into master
veenaramesh Jul 26, 2023
3e3c31e
Added Terraform Exporter product notice (#10)
veenaramesh Aug 16, 2023
fdde2e8
Merge branch 'databrickslabs:master' into master
veenaramesh Aug 16, 2023
0f288fb
Notebook ACLs failure issue resolved
tejasnp163 Aug 24, 2023
198c9ae
Merge pull request #11 from Lovelytics/notebook_acl_patch
tejasnp163 Aug 24, 2023
b8bc1d2
Fix missing key for hipaa option when using export_db (#12)
veenaramesh Aug 28, 2023
20b62cb
Merge branch 'databrickslabs:master' into master
veenaramesh Aug 28, 2023
6c37aa9
Update HiveClient.py
veenaramesh Aug 28, 2023
446b671
Update to_csv.py
veenaramesh Aug 28, 2023
d2cf97a
Update to_csv.py
veenaramesh Aug 28, 2023
24f0025
Update ClustersClient.py to add old cluster id and name (#13)
tejasnp163 Sep 6, 2023
5b69ed9
Merge branch 'databrickslabs:master' into master
veenaramesh Sep 6, 2023
d4187ce
Update WorkspaceClient.py to optimize ACL bug fix code
tejasnp163 Sep 14, 2023
90f38b7
Merge pull request #14 from Lovelytics/tejasnp163-patch-2
tejasnp163 Sep 25, 2023
00a0040
Update to_csv.py for better error handling
tejasnp163 Oct 4, 2023
f90d8bc
Merge pull request #15 from Lovelytics/tejasnp163-patch-3
veenaramesh Oct 4, 2023
6e6dc9c
added option to convert all usernames to lowercase
Oct 5, 2023
f7b2541
Update HiveClient.py - include view name in failed import logs
veenaramesh Oct 12, 2023
ae99d6b
Update to_csv.py
tejasnp163 Oct 18, 2023
1c8915c
Merge pull request #17 from Lovelytics/tejasnp163-patch-3
allistaircota Oct 18, 2023
0a2edda
Filtering out DS_Store hidden file when listing metastore dbs
Oct 18, 2023
41d4973
Merge pull request #18 from Lovelytics/allistair_patch_2
tejasnp163 Oct 18, 2023
098074f
change cluster names, add databases to csv
Oct 24, 2023
47e3736
split on databases + tables
Oct 24, 2023
aaa6dc6
Restrict the job renaming to imported jobs with ::: only
tejasnp163 Oct 31, 2023
cc8a087
Merge pull request #19 from Lovelytics/tejasnp163-patch-4
veenaramesh Oct 31, 2023
ccdae72
export single database
Nov 7, 2023
b00a44b
changing databases param to accept list
Nov 7, 2023
9df4ff6
Add files via upload
veenaramesh Nov 30, 2023
59e8f9f
adding DBFS sizing notebook
veenaramesh Dec 1, 2023
fe2002d
Add files via upload
veenaramesh Dec 5, 2023
8f2d5ff
Add files via upload
veenaramesh Dec 5, 2023
0b9fe11
add rename emails file
veenaramesh Dec 7, 2023
32dc283
add files
veenaramesh Dec 12, 2023
c3512d6
add --nitro parameter
Dec 13, 2023
a6ec90f
deleting extraneous files
veenaramesh Dec 13, 2023
6a1d233
delete extra files
veenaramesh Dec 13, 2023
009bb5c
delete extra files
veenaramesh Dec 13, 2023
22aa453
Merge pull request #20 from Lovelytics/veenaramesh-patch-1
veenaramesh Dec 13, 2023
7b0f1bc
delete extra files
veenaramesh Dec 13, 2023
0b67c31
update dbclient/ClustersClient.py to update nitro
Dec 14, 2023
440dee6
Merge branch 'master' of https://github.com/Lovelytics/migrate
Dec 14, 2023
2e980cf
adding replace group scripts
Jan 5, 2024
444c7cb
wrong push! new push
Jan 5, 2024
68c6aee
delete notebooks!
veenaramesh Jan 9, 2024
4805188
Update delete_clusters.py - unpins now as well
veenaramesh Jan 9, 2024
fc616d3
Update delete_clusters.py
veenaramesh Jan 9, 2024
54022b0
Merge pull request #21 from Lovelytics/veenaramesh-patch-1
veenaramesh Jan 9, 2024
d7e2db9
Clusters Scout to see what DBRs are being used
veenaramesh Jan 11, 2024
8ba490b
Update rename_emails.py
veenaramesh Jan 11, 2024
f0a414e
code to update clusters with correct IPs
veenaramesh Jan 12, 2024
ce2bb71
Update patch_clusters.py
veenaramesh Jan 12, 2024
3cd46fd
Update patch_clusters.py
veenaramesh Jan 12, 2024
91bd2be
Update rename_emails.py
veenaramesh Jan 12, 2024
754efb1
Update patch_clusters.py
veenaramesh Jan 16, 2024
6f937f1
add sample_jobs filter
veenaramesh Jan 17, 2024
162f11e
Add metastore scouts
veenaramesh Jan 17, 2024
b9760e0
empty dir creator
veenaramesh Jan 17, 2024
7932516
Add files via upload
veenaramesh Jan 17, 2024
55aed2a
Create DBFS File Import
veenaramesh Jan 18, 2024
434899a
Update ClustersClient.py
veenaramesh Jan 22, 2024
dd43726
Merge pull request #24 from Lovelytics/veenaramesh-dev
veenaramesh Jan 24, 2024
3f206f7
Merge pull request #16 from Lovelytics/allistair_patch_1
veenaramesh Jan 24, 2024
d0a12d3
Update ClustersClient.py
veenaramesh Jan 30, 2024
6def613
Update ClustersClient.py
veenaramesh Jan 30, 2024
a41230c
Update ClustersClient.py
veenaramesh Feb 1, 2024
922e5c7
Update ClustersClient.py
veenaramesh Feb 2, 2024
452223e
rename emails for specific edge case
veenaramesh Feb 6, 2024
d2bdd48
Delete data/notebooks/Clusters_Scout.py
veenaramesh Feb 7, 2024
919610f
Delete data/notebooks/create_sample_jobs.py
veenaramesh Feb 7, 2024
dd5cd46
Delete data/notebooks/patch_clusters.py
veenaramesh Feb 7, 2024
47e3dc9
Delete data/notebooks/rename_emails.py
veenaramesh Feb 7, 2024
4d927e9
Delete data/notebooks/delete_clusters.py
veenaramesh Feb 7, 2024
b025344
Delete data/notebooks/replace_groups.py
veenaramesh Feb 7, 2024
a0ba5e6
Update ClustersClient.py
veenaramesh Mar 1, 2024
d89bfe2
Add files via upload
cbartholomew2 Jun 21, 2024
e2e3a43
Update WorkspaceClient.py
mcmuffin18 Jun 21, 2024
26661ad
Merge pull request #25 from Lovelytics/mcmuffin18-patch-1
cbartholomew2 Jun 21, 2024
c05a6f8
Search and Replace in File
mcmuffin18 Jul 22, 2024
24056e6
Merge pull request #26 from Lovelytics/mcmuffin18-patch-1
cbartholomew2 Jul 22, 2024
92df318
Add files via upload
mcmuffin18 Jul 29, 2024
f756197
Merge pull request #27 from Lovelytics/mcmuffin18-patch-2
cbartholomew2 Jul 29, 2024
c2ca4f5
Add files via upload
mcmuffin18 Jul 30, 2024
31fe731
Merge pull request #28 from Lovelytics/mcmuffin18-patch-3
cbartholomew2 Jul 30, 2024
5beb31c
Add files via upload
mcmuffin18 Aug 1, 2024
6273fef
Merge pull request #29 from Lovelytics/mcmuffin18-patch-4
cbartholomew2 Aug 1, 2024
0c74677
Add files via upload
mcmuffin18 Aug 1, 2024
0296eda
Merge pull request #30 from Lovelytics/mcmuffin18-patch-5
mcmuffin18 Aug 1, 2024
6a24baa
Add files via upload
mcmuffin18 Aug 15, 2024
b9b3818
Merge pull request #32 from Lovelytics/mcmuffin18-patch-7
cbartholomew2 Aug 15, 2024
984cfe0
Add files via upload
mcmuffin18 Aug 15, 2024
0c58830
Merge pull request #33 from Lovelytics/mcmuffin18-patch-7
cbartholomew2 Aug 15, 2024
5ba7c51
Add files via upload
mcmuffin18 Aug 20, 2024
7c47bd7
Merge pull request #34 from Lovelytics/mcmuffin18-patch-7
cbartholomew2 Aug 20, 2024
0484a69
Add files via upload
cbartholomew2 Aug 27, 2024
d6c8591
Update default_jobs_cluster_aws.json
cbartholomew2 Aug 28, 2024
5e04efc
Update default_jobs_cluster_aws_hipaa.json
cbartholomew2 Aug 28, 2024
a1d85d6
Update nitro_mapping.csv
cbartholomew2 Aug 28, 2024
258f4ca
Update HMS_Modification_Get_Database.py
mcmuffin18 Sep 3, 2024
1784ab4
Merge pull request #35 from Lovelytics/mcmuffin18-patch-8
cbartholomew2 Sep 3, 2024
1b73e7b
added use-logs flag, pagination, and concurrent futures
jsparhamii Jun 26, 2025
f8be476
added scim_client to WorkspaceClient for retrieving users
jsparhamii Jun 26, 2025
9167972
added results=None param to get active users in ScimClient
jsparhamii Jul 9, 2025
3 changes: 3 additions & 0 deletions .gitignore
@@ -12,3 +12,6 @@ dist/
.tox/
databricks_migration_tool.egg-info
migrate.iml
export_dir/
unversioned/

Binary file added Root Hive Migration.dbc
Binary file not shown.
43 changes: 43 additions & 0 deletions Workspace Sizing Notebook.html

Large diffs are not rendered by default.

975 changes: 975 additions & 0 deletions WorkspaceClient_modified.py

Large diffs are not rendered by default.

117 changes: 117 additions & 0 deletions convert_all_logs.py
@@ -0,0 +1,117 @@
###################### importing other scripts ##############################################
from utils import to_csv as util
from utils import create_asset_mapping_spreadsheet as create_spreadsheet
############################################################################################
import argparse
import os

def main(checkpoint, destination="csv"):
    # where you want the csv files to be located;
    # make the csv directory if it's not there
    if destination not in os.listdir():
        print(f"Creating {destination}...")
        os.mkdir(f"./{destination}")

    # users
    users_data = util.read_log("users.log", checkpoint)
    if users_data == 1:  # file not found
        print("users.log not found in checkpoint session. Skipping...")
    else:
        users_df = util.create_users(users_data)
        util.save_to_csv(users_df, "users.csv", destination)

    # instance profiles
    ip_data = util.read_log("instance_profiles.log", checkpoint)
    if ip_data == 1:  # file not found
        print("instance_profiles.log not found in checkpoint session. Skipping...")
    else:
        ip_df = util.create_instance_profiles(ip_data)
        util.save_to_csv(ip_df, "instance_profiles.csv", destination)

    # instance pools
    ipo_data = util.read_log("instance_pools.log", checkpoint)
    if ipo_data == 1:  # file not found
        print("instance_pools.log not found in checkpoint session. Skipping...")
    else:
        ipo_df = util.create_instance_pools(ipo_data)
        util.save_to_csv(ipo_df, "instance_pools.csv", destination)

    # groups
    groups_df = util.create_groups("groups", checkpoint)
    if groups_df == 1:  # directory not found
        print("groups.log not found in checkpoint session. Skipping...")
    else:
        util.save_to_csv(groups_df, "groups.csv", destination)

    # clusters
    clusters_data = util.read_log("clusters.log", checkpoint)
    if clusters_data == 1:  # file not found
        print("clusters.log not found in checkpoint session. Skipping...")
    else:
        clusters_df = util.create_clusters(clusters_data)
        util.save_to_csv(clusters_df, "clusters.csv", destination)

    # cluster policies
    cluster_policies_data = util.read_log("cluster_policies.log", checkpoint)
    if cluster_policies_data == 1:  # file not found
        print("cluster_policies.log not found in checkpoint session. Skipping...")
    else:
        cluster_policies_df = util.create_cluster_policies(cluster_policies_data)
        util.save_to_csv(cluster_policies_df, "cluster_policies.csv", destination)

    # jobs
    jobs_data = util.read_log("jobs.log", checkpoint)
    if jobs_data == 1:  # file not found
        print("jobs.log not found in checkpoint session. Skipping...")
    else:
        jobs_acls = util.read_log("acl_jobs.log", checkpoint)
        jobs_df = util.create_jobs(jobs_data, jobs_acls)
        util.save_to_csv(jobs_df, "jobs.csv", destination)

    # shared notebooks
    shared_df = util.create_shared_logs("artifacts/Shared", checkpoint)
    if shared_df == 1:  # directory not found
        print("Shared notebooks not found in checkpoint session. Skipping...")
    else:
        util.save_to_csv(shared_df, "global_shared_logs.csv", destination)

    # other artifacts
    other_df = util.create_other_artifacts("artifacts", checkpoint)
    if other_df == 1:  # directory not found
        print("Global artifacts not found in checkpoint session. Skipping...")
    else:
        util.save_to_csv(other_df, "global_logs.csv", destination)

    # libraries
    libraries_data = util.read_log("libraries.log", checkpoint)
    if libraries_data == 1:  # file not found
        print("libraries.log not found in checkpoint session. Skipping...")
    else:
        libraries_df = util.create_libraries(libraries_data)
        util.save_to_csv(libraries_df, "libraries.csv", destination)

    # secret scopes
    scopes_df = util.create_scopes("secret_scopes", checkpoint)
    if scopes_df == 1:  # directory not found
        print("secret_scopes.log not found in checkpoint session. Skipping...")
    else:
        util.save_to_csv(scopes_df, "secret_scopes.csv", destination)

    # just databases
    databases_df = util.create_database(checkpoint, directory_name="metastore")
    if databases_df == 1:  # directory not found
        print("metastore.log not found in checkpoint session. Skipping...")
    else:
        util.save_to_csv(databases_df, "databases.csv", destination)

    # entire metastore
    metastore_df = util.create_metastore(checkpoint, directory_name="metastore")
    if metastore_df == 1:  # directory not found
        print("metastore.log not found in checkpoint session. Skipping...")
    else:
        util.save_to_csv(metastore_df, "metastore.csv", destination)

    create_spreadsheet.csv_to_excel(f"./{destination}")
    print("Successfully created spreadsheet asset_mapping.xlsx.")

if __name__ == "__main__":
    all_args = argparse.ArgumentParser()
    all_args.add_argument("--checkpoint", "--session", dest="checkpoint", default="", help="set if you are using a checkpoint during export")
    all_args.add_argument("--destination", dest="destination", default="csv", help="destination of converted logs (default: ./csv)")

    args = all_args.parse_args()
    main(args.checkpoint, args.destination)
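
For reference, a minimal invocation of the new script might look like the line below; the session name prod_workspace is purely illustrative, and the flags are the ones defined in the argparse block above.

    python convert_all_logs.py --checkpoint prod_workspace --destination csv

This reads the export logs from the named checkpoint session, writes one CSV per asset type into ./csv, and then builds asset_mapping.xlsx from that directory.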
2 changes: 1 addition & 1 deletion data/aws_cluster.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
"cluster_name": "Workspace_Migration_Work_Leave_Me_Alone",
"cluster_name": "E2_Migration",
"spark_version": "10.4.x-scala2.12",
"aws_attributes": {
"first_on_demand": 1,
2 changes: 1 addition & 1 deletion data/aws_cluster_hipaa.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
"cluster_name": "Workspace_Migration_Work_Leave_Me_Alone",
"cluster_name": "E2_Migration",
"spark_version": "10.4.x-scala2.12",
"aws_attributes": {
"first_on_demand": 1,
8 changes: 2 additions & 6 deletions data/aws_cluster_table_acls.json
@@ -1,13 +1,9 @@
{
"num_workers": 1,
"cluster_name": "API_Table_ACL_Work_Leave_Me_Alone",
"cluster_name": "E2_Migration_Table_ACLs",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {
"spark.databricks.cluster.profile": "serverless",
"spark.databricks.repl.allowedLanguages": "python,sql",
"spark.databricks.acl.dfAclsEnabled": "true",
"spark.sql.hive.metastore.version": "1.2.1",
"spark.sql.hive.metastore.jars": "maven"
"spark.databricks.acl.dfAclsEnabled": "true"
},
"aws_attributes": {
"first_on_demand": 1,
2 changes: 1 addition & 1 deletion data/aws_cluster_table_acls_hipaa.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
"cluster_name": "API_Table_ACL_Work_Leave_Me_Alone",
"cluster_name": "E2_Migration_Table_ACLs",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {
"spark.databricks.cluster.profile": "serverless",
2 changes: 1 addition & 1 deletion data/azure_cluster.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
"cluster_name": "API_Metastore_Work_Leave_Me_Alone",
"cluster_name": "E2_Migration",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {},
"node_type_id": "Standard_D8_v3",
2 changes: 1 addition & 1 deletion data/azure_cluster_table_acls.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
"cluster_name": "API_Table_ACL_Work_Leave_Me_Alone",
"cluster_name": "E2_Migration_Table_ACLs",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {
"spark.databricks.cluster.profile": "serverless",
2 changes: 1 addition & 1 deletion data/default_jobs_cluster_aws.json
@@ -1,6 +1,6 @@
{
"num_workers": 8,
"spark_version": "7.3.x-scala2.12",
"spark_version": "14.3.x-scala2.12",
"node_type_id": "i3.xlarge",
"spark_env_vars": {
"PYSPARK_PYTHON": "/databricks/python3/bin/python3"
2 changes: 1 addition & 1 deletion data/default_jobs_cluster_aws_hipaa.json
@@ -1,6 +1,6 @@
{
"num_workers": 8,
"spark_version": "7.3.x-scala2.12",
"spark_version": "14.3.x-scala2.12",
"node_type_id": "i4i.xlarge",
"spark_env_vars": {
"PYSPARK_PYTHON": "/databricks/python3/bin/python3"
2 changes: 1 addition & 1 deletion data/gcp_cluster.json
@@ -1,6 +1,6 @@
{
"num_workers": 1,
"cluster_name": "Workspace_Migration_Work_Leave_Me_Alone",
"cluster_name": "E2_Migration",
"spark_version": "10.4.x-scala2.12",
"gcp_attributes": {
"first_on_demand": 1
2 changes: 1 addition & 1 deletion data/gcp_cluster_table_acls.json
@@ -1,5 +1,5 @@
{
"cluster_name": "API_Table_ACL_Work_Leave_Me_Alone",
"cluster_name": "E2_Migration_Table_ACLs",
"spark_version": "10.4.x-scala2.12",
"spark_conf": {
"spark.databricks.cluster.profile": "serverless",