@@ -134,17 +134,7 @@ def create_grants_df(database_name: str,object_type: str, object_key: str):
     return grants_df
 
 
-def create_table_ACLSs_df_for_databases(database_names: List[str]):
-
-    # TODO check Catalog heuristic:
-    # if all databases are exported, we include the Catalog grants as well
-    # if only a few databases are exported: we exclude the Catalog
-    if database_names is None or database_names == '':
-        database_names = get_database_names()
-        include_catalog = True
-    else:
-        include_catalog = False
-
+def create_table_ACLSs_df_for_databases(database_names: List[str], include_catalog: bool):
     num_databases_processed = len(database_names)
     num_tables_or_views_processed = 0
 
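For context, a minimal sketch of the new call shape: the Catalog heuristic moves out of the function, so the caller now passes include_catalog explicitly. The sketch assumes the notebook's existing get_database_names() helper; the Run Export cell in the next hunk drives it from the Databases widget in the same way.

    # sketch only: export everything and opt in to Catalog-level grants
    all_databases = get_database_names()
    table_ACLs_df, n_dbs, n_objs = create_table_ACLSs_df_for_databases(all_databases, include_catalog=True)
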
@@ -201,35 +191,53 @@ def create_table_ACLSs_df_for_databases(database_names: List[str]):
 # COMMAND ----------
 
 # DBTITLE 1,Run Export
+def chunks(lst, n):
+    """Yield successive n-sized chunks from lst."""
+    for i in range(0, len(lst), n):
+        yield lst[i:i + n]
+
+
 databases_raw = dbutils.widgets.get("Databases")
 output_path = dbutils.widgets.get("OutputPath")
 
 if databases_raw.rstrip() == '':
-    databases = None
+    # TODO check Catalog heuristic:
+    # if all databases are exported, we include the Catalog grants as well
+    databases = get_database_names()
+    include_catalog = True
     print(f"Exporting all databases")
 else:
+    # if only a few databases are exported: we exclude the Catalog
     databases = [x.rstrip().lstrip() for x in databases_raw.split(",")]
+    include_catalog = False
     print(f"Exporting the following databases: {databases}")
 
+counter = 1
+for databases_chunk in chunks(databases, 1):
+    table_ACLs_df, num_databases_processed, num_tables_or_views_processed = create_table_ACLSs_df_for_databases(
+        databases_chunk, include_catalog
+    )
+
+    print(
+        f"{datetime.datetime.now()} total processed in chunk {counter}: databases: {num_databases_processed}, tables or views: {num_tables_or_views_processed}")
+    print(f"{datetime.datetime.now()} writing table ACLs to {output_path}")
+
+    # with table ACLs active, a direct write to DBFS is not allowed, so we store
+    # the dataframe as a gzipped JSON file for consistent file diffs
+    (
+        table_ACLs_df
+        # .coalesce(1)
+        .selectExpr("Database", "Principal", "ActionTypes", "ObjectType", "ObjectKey", "ExportTimestamp")
+        # .sort("Database", "Principal", "ObjectType", "ObjectKey")
+        .write
+        .format("JSON")
+        .option("compression", "gzip")
+        .mode("append" if counter > 1 else "overwrite")
+        .save(output_path)
+    )
 
-table_ACLs_df, num_databases_processed, num_tables_or_views_processed = create_table_ACLSs_df_for_databases(databases)
-
-print(f"{datetime.datetime.now()} total processed: databases: {num_databases_processed}, tables or views: {num_tables_or_views_processed}")
-print(f"{datetime.datetime.now()} writing table ACLs to {output_path}")
-
-# with table ACLs active, a direct write to DBFS is not allowed, so we store
-# the dataframe as a gzipped JSON file, sorted for consistent file diffs
-(
-    table_ACLs_df
-    .coalesce(1)
-    .selectExpr("Database", "Principal", "ActionTypes", "ObjectType", "ObjectKey", "ExportTimestamp")
-    .sort("Database", "Principal", "ObjectType", "ObjectKey")
-    .write
-    .format("JSON")
-    .option("compression", "gzip")
-    .mode("overwrite")
-    .save(output_path)
-)
+    counter += 1
+    include_catalog = False
 
 
 # COMMAND ----------
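A quick sanity check on the chunked export is to read the gzipped JSON back with Spark, which detects the compression automatically. A minimal sketch, assuming the same output_path widget value and the column names written above:

    # read the appended chunk files back and count exported ACL rows per database
    exported = spark.read.json(output_path)
    exported.groupBy("Database").count().orderBy("Database").show(truncate=False)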