Commit d7fd3b6

table_acls export databases 1 by 1 to avoid driver OOM
1 parent ce27b1a commit d7fd3b6
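
Why this helps: before this commit the notebook built a single grants DataFrame for every database and wrote it out in one pass, which could exhaust driver memory on workspaces with many databases. The commit instead walks the database list in chunks of one, exporting each chunk and appending it to the same output path. A minimal sketch of that pattern, detached from the notebook (chunks matches the helper added in the diff below; export_chunk is a hypothetical stand-in for the per-chunk work):

    def chunks(lst, n):
      """Yield successive n-sized chunks from lst."""
      for i in range(0, len(lst), n):
        yield lst[i:i + n]

    for counter, chunk in enumerate(chunks(["db_a", "db_b", "db_c"], 1), start=1):
      # export_chunk is hypothetical; the first chunk overwrites the
      # output, every later chunk appends, mirroring the commit's write mode
      export_chunk(chunk, mode="append" if counter > 1 else "overwrite")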

File tree

1 file changed

data/notebooks/Export_Table_ACLs.py

Lines changed: 40 additions & 25 deletions
@@ -134,16 +134,16 @@ def create_grants_df(database_name: str,object_type: str, object_key: str):
   return grants_df
 
 
-def create_table_ACLSs_df_for_databases(database_names: List[str]):
+def create_table_ACLSs_df_for_databases(database_names: List[str], include_catalog: bool):
 
   # TODO check Catalog heuristic:
   #  if all databases are exported, we include the Catalog grants as well
   #  if only a few databases are exported: we exclude the Catalog
-  if database_names is None or database_names == '':
-    database_names = get_database_names()
-    include_catalog = True
-  else:
-    include_catalog = False
+  # if database_names is None or database_names == '':
+  #   database_names = get_database_names()
+  #   include_catalog = True
+  # else:
+  #   include_catalog = False
 
   num_databases_processed = len(database_names)
   num_tables_or_views_processed = 0
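
The hunk above moves the include_catalog decision out of the function and into the caller (the old in-function heuristic is kept as commented-out code). A minimal sketch of the resulting calling convention, using the notebook's own functions with illustrative database names:

    # exporting everything: catalog grants are included
    all_dbs = get_database_names()
    acls_df, n_dbs, n_objs = create_table_ACLSs_df_for_databases(all_dbs, True)

    # exporting a named subset: catalog grants are excluded
    acls_df, n_dbs, n_objs = create_table_ACLSs_df_for_databases(["sales", "finance"], False)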
@@ -201,35 +201,50 @@ def create_table_ACLSs_df_for_databases(database_names: List[str]):
 # COMMAND ----------
 
 # DBTITLE 1,Run Export
+def chunks(lst, n):
+  """Yield successive n-sized chunks from lst."""
+  for i in range(0, len(lst), n):
+    yield lst[i:i + n]
+
+
 databases_raw = dbutils.widgets.get("Databases")
 output_path = dbutils.widgets.get("OutputPath")
 
 if databases_raw.rstrip() == '':
-  databases = None
+  databases = get_database_names()
+  include_catalog = True
   print(f"Exporting all databases")
 else:
   databases = [x.rstrip().lstrip() for x in databases_raw.split(",")]
+  include_catalog = False
   print(f"Exporting the following databases: {databases}")
 
+counter = 1
+for databases_chunks in chunks(databases, 1):
+  table_ACLs_df, num_databases_processed, num_tables_or_views_processed = create_table_ACLSs_df_for_databases(
+    databases_chunks, include_catalog
+  )
+
+  print(
+    f"{datetime.datetime.now()} total number processed chunk {counter}: databases: {num_databases_processed}, tables or views: {num_tables_or_views_processed}")
+  print(f"{datetime.datetime.now()} writing table ACLs to {output_path}")
+
+  # with table ACLs active, a direct write to DBFS is not allowed, so we store
+  # the dataframe as gzipped JSON files, for consistent file diffs
+  (
+    table_ACLs_df
+    # .coalesce(1)
+    .selectExpr("Database", "Principal", "ActionTypes", "ObjectType", "ObjectKey", "ExportTimestamp")
+    # .sort("Database","Principal","ObjectType","ObjectKey")
+    .write
+    .format("JSON")
+    .option("compression", "gzip")
+    .mode("append" if counter > 1 else "overwrite")
+    .save(output_path)
+  )
 
-table_ACLs_df,num_databases_processed, num_tables_or_views_processed = create_table_ACLSs_df_for_databases(databases)
-
-print(f"{datetime.datetime.now()} total number processed: databases: {num_databases_processed}, tables or views: {num_tables_or_views_processed}")
-print(f"{datetime.datetime.now()} writing table ACLs to {output_path}")
-
-# with table ACLS active, I direct write to DBFS is not allowed, so we store
-# the dateframe as a table for single zipped JSON file sorted, for consitent file diffs
-(
-  table_ACLs_df
-  .coalesce(1)
-  .selectExpr("Database","Principal","ActionTypes","ObjectType","ObjectKey","ExportTimestamp")
-  .sort("Database","Principal","ObjectType","ObjectKey")
-  .write
-  .format("JSON")
-  .option("compression","gzip")
-  .mode("overwrite")
-  .save(output_path)
-)
+  counter += 1
+  include_catalog = False
 
 
 # COMMAND ----------
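
Note on the output layout: with .coalesce(1) and .sort(...) commented out and append mode in play, the export lands as multiple gzipped JSON part files under output_path rather than one sorted file. Reading it back is still a single load; a sketch assuming the usual Databricks notebook context where spark and output_path are in scope:

    # load every appended chunk as one DataFrame and sanity-check per-database counts
    exported = spark.read.format("json").load(output_path)
    exported.groupBy("Database").count().show()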
