
Commit 1a5357b

Author: dougspadotto
Commit message: added external location to information_schema.tables and recovered from it
1 parent 2fcf057 · commit 1a5357b

File tree
2 files changed: +60 -6 lines changed
metastore_export_import/01_backup_catalog.py
metastore_export_import/02_recreate_catalog.py


metastore_export_import/01_backup_catalog.py

Lines changed: 53 additions & 2 deletions
```diff
@@ -11,19 +11,70 @@
 
 # COMMAND ----------
 
+from delta.tables import *
+
+# COMMAND ----------
+
 dbutils.widgets.removeAll()
 dbutils.widgets.text("storageLocation", "/mnt/externallocation", "Storage location for copy")
 dbutils.widgets.text("catalogName", "system", "information_schema catalog")
+dbutils.widgets.dropdown("getExternalLocations", "True", ["True", "False"])
 
 # COMMAND ----------
 
 storage_location = dbutils.widgets.get("storageLocation")
 catalog_name = dbutils.widgets.get("catalogName")
+get_external_location = dbutils.widgets.get("getExternalLocations")
 
 table_list = spark.catalog.listTables(f"{catalog_name}.information_schema")
 
 # COMMAND ----------
 
 for table in table_list:
-    df = spark.sql(f"SELECT * FROM {table.catalog}.information_schema.{table.name}")
-    df.write.format("delta").mode("overwrite").save(f"{storage_location}/{table.name}")
+    info_schema_table_df = spark.sql(f"SELECT * FROM {table.catalog}.information_schema.{table.name}")
+    info_schema_table_df.write.format("delta").mode("overwrite").save(f"{storage_location}/{table.name}")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Optional step
+# MAGIC Get table locations by running DESCRIBE EXTENDED on each table listed in information_schema.tables
+
+# COMMAND ----------
+
+if get_external_location == "True":  # widget values are strings, so compare explicitly
+    table_location_columns = ["table_catalog", "table_schema", "table_name", "table_location"]
+    table_location_storage = "external_table_locations"
+    location_list = []
+
+    # Need to filter out the Unity Catalog data source, which also counts as external
+    describe_table_list = spark.read.table(f"{catalog_name}.information_schema.tables").filter("table_type == 'EXTERNAL' AND data_source_format <> 'UNITY_CATALOG'")
+
+    for d_table in describe_table_list.collect():
+        d_location = spark.sql(f"DESCRIBE EXTENDED {d_table.table_catalog}.{d_table.table_schema}.{d_table.table_name}").filter("col_name = 'Location'").select("data_type").head()[0]
+        location_list.append([d_table.table_catalog, d_table.table_schema, d_table.table_name, d_location])
+
+    location_df = spark.createDataFrame(data=location_list, schema=table_location_columns)
+
+    # Merge with information_schema.tables and save the external locations to the storage_sub_directory column (which, as of 03/09, only holds managed-table information)
+    table_df = DeltaTable.forPath(spark, f"{storage_location}/tables")
+    #table_df = spark.sql(f"SELECT * FROM {table.catalog}.information_schema.tables")
+    table_df.alias('tables') \
+        .merge(
+            location_df.alias('locations'),
+            'tables.table_catalog = locations.table_catalog and tables.table_schema = locations.table_schema and tables.table_name = locations.table_name'
+        ) \
+        .whenMatchedUpdate(set =
+            {
+                "storage_sub_directory": "locations.table_location"
+            }
+        ) \
+        .execute()
+
+    display(table_df.toDF())  # DeltaTable -> DataFrame so display() can render it
+    # Or create a separate table only for this:
+    #(location_df
+    #    .write
+    #    .mode("overwrite")
+    #    .format("delta")
+    #    .save(f"{storage_location}/{table_location_storage}"))
```
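
A quick way to sanity-check the optional step is to read the backed-up copy of information_schema.tables and confirm that external (non-Unity-Catalog) tables now carry their extracted path. The snippet below is a minimal sketch, not part of the commit; it assumes the backup notebook above has already run and that `storage_location` holds the same widget value it used.

```python
# Minimal verification sketch (assumes 01_backup_catalog.py has already written its output).
storage_location = dbutils.widgets.get("storageLocation")  # same widget as the backup notebook

# Read the Delta copy of information_schema.tables produced by the backup loop
backup_tables_df = spark.read.format("delta").load(f"{storage_location}/tables")

# After the merge, storage_sub_directory should hold the DESCRIBE EXTENDED location
# for every external table that is not backed by the UNITY_CATALOG data source.
display(
    backup_tables_df
        .filter("table_type == 'EXTERNAL' AND data_source_format <> 'UNITY_CATALOG'")
        .select("table_catalog", "table_schema", "table_name", "storage_sub_directory")
)
```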

metastore_export_import/02_recreate_catalog.py

Lines changed: 7 additions & 4 deletions
```diff
@@ -8,14 +8,14 @@
 # MAGIC
 # MAGIC Assumptions:
 # MAGIC - The storage credential(s) and external location(s) of the parent external location need to be created on the target UC beforehand.
-# MAGIC - The external location for all schemas is the same, formed of ```<storage location root>/<schema name>/<table name>```
+# MAGIC - The external location is taken from the overwritten storage_sub_directory column on information_schema.tables (previously it was formed as ```<storage location root>/<schema name>/<table name>```)
 # MAGIC - All tables are Delta
 
 # COMMAND ----------
 
 dbutils.widgets.removeAll()
 dbutils.widgets.text("storageLocation", "/mnt/externallocation", "Storage with source catalog info")
-dbutils.widgets.text("catalogName", "system", "information_schema catalog")
+dbutils.widgets.text("catalogName", "system_backup", "information_schema catalog")
 dbutils.widgets.text("rootExternalStorage", "abfss://[email protected]/root/", "Root of external tables' path")
 
 # COMMAND ----------
@@ -69,7 +69,6 @@ def return_schema(df):
 
 from pyspark.sql.functions import col, when, collect_list, upper
 
-
 # Get only user schemas
 schemas_df = spark.read.format("delta").load(f"{storage_location}/schemata").filter("schema_name<>'information_schema'")
 
@@ -97,7 +96,11 @@ def return_schema(df):
     columns = return_schema(columns_df)
 
     # Create table
-    spark.sql(f"CREATE OR REPLACE TABLE {catalog_name}.{table.table_schema}.{table.table_name}({columns}) COMMENT '{table.comment}' LOCATION '{root_externalstorage}{table.table_schema}/{table.table_name}'")
+    # Hard-coded path
+    #spark.sql(f"CREATE OR REPLACE TABLE {catalog_name}.{table.table_schema}.{table.table_name}({columns}) COMMENT '{table.comment}' LOCATION '{root_externalstorage}{table.table_schema}/{table.table_name}'")
+
+    # Extracted path
+    spark.sql(f"CREATE OR REPLACE TABLE {catalog_name}.{table.table_schema}.{table.table_name}({columns}) COMMENT '{table.comment}' LOCATION '{table.storage_sub_directory}'")
     spark.sql(f"ALTER TABLE {catalog_name}.{table.table_schema}.{table.table_name} SET OWNER to `{table.table_owner}`")
 
 # COMMAND ----------
```
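
After the restore, the new LOCATION can be cross-checked against the value recovered during backup. The snippet below is a hedged sketch rather than part of the commit: `my_schema` and `my_external_table` are placeholder names, and it assumes the `storageLocation` and `catalogName` widgets are still set as in 02_recreate_catalog.py.

```python
# Post-restore spot check (sketch; the schema/table names below are placeholders).
storage_location = dbutils.widgets.get("storageLocation")
catalog_name = dbutils.widgets.get("catalogName")

# Location captured during backup (merged into storage_sub_directory by 01_backup_catalog.py)
backup_row = (
    spark.read.format("delta").load(f"{storage_location}/tables")
         .filter("table_schema = 'my_schema' AND table_name = 'my_external_table'")
         .head()
)

# Location the recreated table actually points at
actual_location = (
    spark.sql(f"DESCRIBE EXTENDED {catalog_name}.my_schema.my_external_table")
         .filter("col_name = 'Location'")
         .select("data_type")
         .head()[0]
)

print(backup_row.storage_sub_directory == actual_location)  # expect True for external tables
```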
