
Commit 4583670

Merge remote-tracking branch 'upstream/master'
2 parents: d7bb929 + 3de69f9

3 files changed: +50 -31 lines

data/notebooks/Export_Table_ACLs.py

Lines changed: 38 additions & 30 deletions
@@ -134,17 +134,7 @@ def create_grants_df(database_name: str,object_type: str, object_key: str):
   return grants_df
 
 
-def create_table_ACLSs_df_for_databases(database_names: List[str]):
-
-  # TODO check Catalog heuristic:
-  #  if all databases are exported, we include the Catalog grants as well
-  #. if only a few databases are exported: we exclude the Catalog
-  if database_names is None or database_names == '':
-    database_names = get_database_names()
-    include_catalog = True
-  else:
-    include_catalog = False
-
+def create_table_ACLSs_df_for_databases(database_names: List[str], include_catalog: bool):
   num_databases_processed = len(database_names)
   num_tables_or_views_processed = 0
 
@@ -201,35 +191,53 @@ def create_table_ACLSs_df_for_databases(database_names: List[str]):
 # COMMAND ----------
 
 # DBTITLE 1,Run Export
+def chunks(lst, n):
+  """Yield successive n-sized chunks from lst."""
+  for i in range(0, len(lst), n):
+    yield lst[i:i + n]
+
+
 databases_raw = dbutils.widgets.get("Databases")
 output_path = dbutils.widgets.get("OutputPath")
 
 if databases_raw.rstrip() == '':
-  databases = None
+  # TODO check Catalog heuristic:
+  #  if all databases are exported, we include the Catalog grants as well
+  databases = get_database_names()
+  include_catalog = True
   print(f"Exporting all databases")
 else:
+  #. if only a few databases are exported: we exclude the Catalog
   databases = [x.rstrip().lstrip() for x in databases_raw.split(",")]
+  include_catalog = False
   print(f"Exporting the following databases: {databases}")
 
+counter = 1
+for databases_chunks in chunks(databases, 1):
+  table_ACLs_df, num_databases_processed, num_tables_or_views_processed = create_table_ACLSs_df_for_databases(
+    databases_chunks, include_catalog
+  )
+
+  print(
+    f"{datetime.datetime.now()} total number processed chunk {counter}: databases: {num_databases_processed}, tables or views: {num_tables_or_views_processed}")
+  print(f"{datetime.datetime.now()} writing table ACLs to {output_path}")
+
+  # with table ACLS active, I direct write to DBFS is not allowed, so we store
+  # the dateframe as a table for single zipped JSON file sorted, for consitent file diffs
+  (
+    table_ACLs_df
+      # .coalesce(1)
+      .selectExpr("Database", "Principal", "ActionTypes", "ObjectType", "ObjectKey", "ExportTimestamp")
+      # .sort("Database","Principal","ObjectType","ObjectKey")
+      .write
+      .format("JSON")
+      .option("compression", "gzip")
+      .mode("append" if counter > 1 else "overwrite")
+      .save(output_path)
+  )
 
-table_ACLs_df,num_databases_processed, num_tables_or_views_processed = create_table_ACLSs_df_for_databases(databases)
-
-print(f"{datetime.datetime.now()} total number processed: databases: {num_databases_processed}, tables or views: {num_tables_or_views_processed}")
-print(f"{datetime.datetime.now()} writing table ACLs to {output_path}")
-
-# with table ACLS active, I direct write to DBFS is not allowed, so we store
-# the dateframe as a table for single zipped JSON file sorted, for consitent file diffs
-(
-  table_ACLs_df
-    .coalesce(1)
-    .selectExpr("Database","Principal","ActionTypes","ObjectType","ObjectKey","ExportTimestamp")
-    .sort("Database","Principal","ObjectType","ObjectKey")
-    .write
-    .format("JSON")
-    .option("compression","gzip")
-    .mode("overwrite")
-    .save(output_path)
-)
+  counter += 1
+  include_catalog = False
 
 
 # COMMAND ----------
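Note: the chunking pattern introduced above is plain Python and can be checked in isolation. Below is a minimal standalone sketch of how chunks() drives the overwrite-then-append write modes; the sample database list is hypothetical and the Spark write is reduced to a print.

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Hypothetical database names, purely for illustration.
databases = ["sales", "finance", "hr"]

for counter, databases_chunk in enumerate(chunks(databases, 1), start=1):
    # The first chunk overwrites the output path, later chunks append,
    # mirroring .mode("append" if counter > 1 else "overwrite") in the diff above.
    write_mode = "append" if counter > 1 else "overwrite"
    print(counter, databases_chunk, write_mode)

With a chunk size of 1, each database is exported and written separately, and only the very first write clears the output path.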

data/notebooks/Import_Table_ACLs.py

Lines changed: 6 additions & 0 deletions
@@ -190,6 +190,12 @@ def execute_sql_statements(sqls):
   l = [ str(o) for o in error_causing_sqls ]
   print("\n".join(l))
 
+# COMMAND ----------
+
+# DBTITLE 1,Nicer error output
+if len(error_causing_sqls) != 0:
+  l = [ {'sql': str(o.get('sql')), 'error': str(o.get('error'))} for o in error_causing_sqls ]
+  display(spark.createDataFrame(l))
 
 # COMMAND ----------
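Note: the added cell simply reshapes the failing statements into rows for tabular output. A minimal sketch of the same idea, assuming an active SparkSession named spark; the sample error_causing_sqls entries are made up, and show() stands in for the Databricks-only display():

error_causing_sqls = [
    {"sql": "GRANT SELECT ON TABLE db.t TO `user@example.com`",
     "error": "Table or view not found: db.t"},
]

if len(error_causing_sqls) != 0:
    rows = [{"sql": str(o.get("sql")), "error": str(o.get("error"))}
            for o in error_causing_sqls]
    spark.createDataFrame(rows).show(truncate=False)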

dbclient/JobsClient.py

Lines changed: 6 additions & 1 deletion
@@ -166,6 +166,11 @@ def import_job_configs(self, log_file='jobs.log', acl_file='acl_jobs.log', job_m
         wmconstants.WM_IMPORT, wmconstants.JOB_OBJECT)
 
     def adjust_ids_for_cluster(settings): #job_settings or task_settings
+      """
+      The task setting may have existing_cluster_id/new_cluster/job_cluster_key for cluster settings.
+      The job level setting may have existing_cluster_id/new_cluster for cluster settings.
+      Adjust cluster settings for existing_cluster_id and new_cluster scenario.
+      """
       if 'existing_cluster_id' in settings:
         old_cid = settings['existing_cluster_id']
         # set new cluster id for existing cluster attribute
@@ -176,7 +181,7 @@ def adjust_ids_for_cluster(settings): #job_settings or task_settings
           settings['new_cluster'] = self.get_jobs_default_cluster_conf()
         else:
           settings['existing_cluster_id'] = new_cid
-      else: # new cluster config
+      elif 'new_cluster' in settings: # new cluster config
         cluster_conf = settings['new_cluster']
         if 'policy_id' in cluster_conf:
           old_policy_id = cluster_conf['policy_id']
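Note: changing else to elif matters for task settings that only carry job_cluster_key (per the new docstring): those now pass through unchanged instead of hitting a missing new_cluster key. A minimal standalone sketch of the branch structure, where the remap helper and sample settings are hypothetical stand-ins for the surrounding class methods:

def adjust_ids_for_cluster(settings, remap_cluster_id):
    """Sketch: only touch settings that actually carry a cluster definition."""
    if 'existing_cluster_id' in settings:
        # remap the old cluster id to the one created in the target workspace
        settings['existing_cluster_id'] = remap_cluster_id(settings['existing_cluster_id'])
    elif 'new_cluster' in settings:  # new cluster config
        cluster_conf = settings['new_cluster']
        if 'policy_id' in cluster_conf:
            # the real method remaps the old policy id to the imported one
            cluster_conf['policy_id'] = 'remapped-' + cluster_conf['policy_id']
    # settings with only job_cluster_key fall through unchanged
    return settings

# A task that only references a job-level cluster is left as-is.
print(adjust_ids_for_cluster({'job_cluster_key': 'shared'}, lambda cid: 'new-' + cid))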
