
Commit c3173eb

Release v0.1.1 (#261)

* Added batched iteration for `INSERT INTO` queries in `StatementExecutionBackend` with default `max_records_per_batch=1000` ([#237](#237)); see the sketch after this list.
* Added crawler for mount points ([#209](#209)).
* Added crawlers for compatibility of jobs and clusters, along with basic recommendations for external locations ([#244](#244)).
* Added safe return on grants ([#246](#246)).
* Added ability to specify empty group filter in the installer script ([#216](#216)) ([#217](#217)).
* Added ability to install application by multiple different users on the same workspace ([#235](#235)).
* Added dashboard creation on installation and a requirement for `warehouse_id` in config, so that the assessment dashboards are refreshed automatically after job runs ([#214](#214)).
* Added reliance on rate limiting from Databricks SDK for listing workspace ([#258](#258)).
* Fixed errors in corner cases where Azure Service Principal Credentials were not available in Spark context ([#254](#254)).
* Fixed `DESCRIBE TABLE` throwing errors when listing Legacy Table ACLs ([#238](#238)).
* Fixed `file already exists` error in the installer script ([#219](#219)) ([#222](#222)).
* Fixed `guess_external_locations` failure with `AttributeError: as_dict` and added an integration test ([#259](#259)).
* Fixed error handling edge cases in `crawl_tables` task ([#243](#243)) ([#251](#251)).
* Fixed `crawl_permissions` task failure on folder names containing a forward slash ([#234](#234)).
* Improved `README` notebook documentation ([#260](#260), [#228](#228), [#252](#252), [#223](#223), [#225](#225)).
* Removed redundant `.python-version` file ([#221](#221)).
* Removed discovery of account groups from `crawl_permissions` task ([#240](#240)).
* Updated databricks-sdk requirement from ~=0.8.0 to ~=0.9.0 ([#245](#245)).
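To illustrate the batched `INSERT INTO` change from the first bullet: the sketch below shows the general chunking idea only. `chunked`, `insert_statements`, and the `ucx.things` table are hypothetical names, not the actual `StatementExecutionBackend` API, and real code would escape values properly rather than relying on `repr`.

```python
from collections.abc import Iterable, Iterator
from itertools import islice


def chunked(rows: Iterable[tuple], size: int) -> Iterator[list[tuple]]:
    """Yield lists of at most `size` rows from any iterable,
    without materializing the whole input in memory."""
    it = iter(rows)
    while batch := list(islice(it, size)):
        yield batch


def insert_statements(
    table: str, rows: Iterable[tuple], max_records_per_batch: int = 1000
) -> Iterator[str]:
    """Build one multi-row INSERT INTO ... VALUES statement per batch,
    instead of one statement per row."""
    for batch in chunked(rows, max_records_per_batch):
        values = ", ".join(f"({', '.join(repr(v) for v in row)})" for row in batch)
        yield f"INSERT INTO {table} VALUES {values}"


# 2500 rows become three statements: 1000 + 1000 + 500 rows each
stmts = list(insert_statements("ucx.things", ((i, f"name-{i}") for i in range(2500))))
assert len(stmts) == 3
```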
1 parent 1a07212 commit c3173eb

File tree

14 files changed: +52 -57 lines changed


CHANGELOG.md

Lines changed: 21 additions & 0 deletions

```diff
@@ -1,5 +1,26 @@
 # Version changelog
 
+## 0.1.1
+
+* Added batched iteration for `INSERT INTO` queries in `StatementExecutionBackend` with default `max_records_per_batch=1000` ([#237](https://github.com/databricks/ucx/pull/237)).
+* Added crawler for mount points ([#209](https://github.com/databricks/ucx/pull/209)).
+* Added crawlers for compatibility of jobs and clusters, along with basic recommendations for external locations ([#244](https://github.com/databricks/ucx/pull/244)).
+* Added safe return on grants ([#246](https://github.com/databricks/ucx/pull/246)).
+* Added ability to specify empty group filter in the installer script ([#216](https://github.com/databricks/ucx/pull/216)) ([#217](https://github.com/databricks/ucx/pull/217)).
+* Added ability to install application by multiple different users on the same workspace ([#235](https://github.com/databricks/ucx/pull/235)).
+* Added dashboard creation on installation and a requirement for `warehouse_id` in config, so that the assessment dashboards are refreshed automatically after job runs ([#214](https://github.com/databricks/ucx/pull/214)).
+* Added reliance on rate limiting from Databricks SDK for listing workspace ([#258](https://github.com/databricks/ucx/pull/258)).
+* Fixed errors in corner cases where Azure Service Principal Credentials were not available in Spark context ([#254](https://github.com/databricks/ucx/pull/254)).
+* Fixed `DESCRIBE TABLE` throwing errors when listing Legacy Table ACLs ([#238](https://github.com/databricks/ucx/pull/238)).
+* Fixed `file already exists` error in the installer script ([#219](https://github.com/databricks/ucx/pull/219)) ([#222](https://github.com/databricks/ucx/pull/222)).
+* Fixed `guess_external_locations` failure with `AttributeError: as_dict` and added an integration test ([#259](https://github.com/databricks/ucx/pull/259)).
+* Fixed error handling edge cases in `crawl_tables` task ([#243](https://github.com/databricks/ucx/pull/243)) ([#251](https://github.com/databricks/ucx/pull/251)).
+* Fixed `crawl_permissions` task failure on folder names containing a forward slash ([#234](https://github.com/databricks/ucx/pull/234)).
+* Improved `README` notebook documentation ([#260](https://github.com/databricks/ucx/pull/260), [#228](https://github.com/databricks/ucx/pull/228), [#252](https://github.com/databricks/ucx/pull/252), [#223](https://github.com/databricks/ucx/pull/223), [#225](https://github.com/databricks/ucx/pull/225)).
+* Removed redundant `.python-version` file ([#221](https://github.com/databricks/ucx/pull/221)).
+* Removed discovery of account groups from `crawl_permissions` task ([#240](https://github.com/databricks/ucx/pull/240)).
+* Updated databricks-sdk requirement from ~=0.8.0 to ~=0.9.0 ([#245](https://github.com/databricks/ucx/pull/245)).
+
 ## 0.1.0
 
 Features
```
src/databricks/labs/ucx/__about__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.1.1"
```

src/databricks/labs/ucx/assessment/commands/create_table_inventory.scala

Lines changed: 0 additions & 32 deletions
This file was deleted.

src/databricks/labs/ucx/assessment/crawlers.py

Lines changed: 0 additions & 4 deletions

```diff
@@ -148,7 +148,3 @@ def snapshot(self) -> list[ClusterInfo]:
     def _try_fetch(self) -> list[ClusterInfo]:
         for row in self._fetch(f"SELECT * FROM {self._schema}.{self._table}"):
             yield JobInfo(*row)
-
-
-if __name__ == "__main__":
-    print("Databricks UC Assessment")
```

src/databricks/labs/ucx/framework/crawlers.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -86,7 +86,7 @@ def _row_to_sql(row, fields):
         elif f.type == bool:
             data.append("TRUE" if value else "FALSE")
         elif f.type == str:
-            value = value.replace("'", "''")
+            value = str(value).replace("'", "''")
             data.append(f"'{value}'")
         elif f.type == int:
             data.append(f"{value}")
```

src/databricks/labs/ucx/hive_metastore/data_objects.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -25,7 +25,7 @@ def _external_locations(self, tables: list[Row], mounts) -> list[ExternalLocation]:
         min_slash = 2
         external_locations: list[ExternalLocation] = []
         for table in tables:
-            location = table.as_dict()["location"]
+            location = table.location
             if location is not None and len(location) > 0:
                 if location.startswith("dbfs:/mnt"):
                     for mount in mounts:
```

src/databricks/labs/ucx/hive_metastore/grants.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -234,6 +234,6 @@ def _grants(
                 any_file=any_file,
                 anonymous_function=anonymous_function,
             )
-        except RuntimeError as e:
+        except Exception as e:
             logger.error(f"Couldn't fetch grants for object {on_type} {key}: {e}")
             return []
```

src/databricks/labs/ucx/hive_metastore/tables.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -140,6 +140,6 @@ def _describe(self, catalog: str, database: str, table: str) -> Table | None:
                 location=describe.get("Location", None),
                 view_text=describe.get("View Text", None),
             )
-        except RuntimeError as e:
+        except Exception as e:
             logger.error(f"Couldn't fetch information for table {full_name} : {e}")
             return None
```

src/databricks/labs/ucx/mixins/sql.py

Lines changed: 15 additions & 3 deletions

```diff
@@ -36,17 +36,29 @@ def __repr__(self):
 
 
 class Row(tuple):
+    # Python SDK convention
     def as_dict(self) -> dict[str, any]:
         return dict(zip(self.__columns__, self, strict=True))
 
-    def __getattr__(self, col):
-        idx = self.__columns__.index(col)
-        return self[idx]
+    # PySpark convention
+    def __contains__(self, item):
+        return item in self.__columns__
 
     def __getitem__(self, col):
+        if isinstance(col, int | slice):
+            return super().__getitem__(col)
         # if columns are named `2 + 2`, for example
         return self.__getattr__(col)
 
+    def __getattr__(self, col):
+        try:
+            idx = self.__columns__.index(col)
+            return self[idx]
+        except IndexError:
+            raise AttributeError(col)  # noqa: B904
+        except ValueError:
+            raise AttributeError(col)  # noqa: B904
+
     def __repr__(self):
         return f"Row({', '.join(f'{k}={v}' for (k, v) in zip(self.__columns__, self, strict=True))})"
```

src/databricks/labs/ucx/runtime.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -34,7 +34,12 @@ def crawl_tables(_: MigrationConfig):
     readily accessible point of reference for users, data engineers, and administrators."""
 
 
-@task("assessment", depends_on=[crawl_tables], job_cluster="tacl")
+@task("assessment", job_cluster="tacl")
+def setup_tacl(_: MigrationConfig):
+    """(Optimization) Starts tacl job cluster in parallel to crawling tables"""
+
+
+@task("assessment", depends_on=[crawl_tables, setup_tacl], job_cluster="tacl")
 def crawl_grants(cfg: MigrationConfig):
     """During this process, our methodology is purposefully designed to systematically scan and retrieve ACLs
     (Access Control Lists) associated with Legacy Tables from the Hive Metastore. These ACLs encompass comprehensive
```
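The empty `setup_tacl` task is a scheduling optimization, as its docstring says: because it has no dependencies but is pinned to the `tacl` job cluster, the workflow can provision that cluster while `crawl_tables` is still running on its own cluster, so `crawl_grants`, which now depends on both tasks, starts on a cluster that is already warm.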
