Hide non-public methods (#1773)

bishwajit-db · web-flow · commit ac242b6e6f32 · 2024-05-27T14:11:34.000+02:00
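Hide non-public methods of the recon API: the query-builder helpers (`build_data_comparison_query`, `build_metadata_query`) and the `DATA_COMPARISON_QUERY_TEMPLATE` constant are renamed with a leading underscore. Unit tests that called these helpers directly are removed, and `test_data_comparison` gains array and struct columns.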
diff --git a/src/databricks/labs/ucx/recon/data_comparator.py b/src/databricks/labs/ucx/recon/data_comparator.py
@@ -13,7 +13,7 @@
 
 
 class StandardDataComparator(DataComparator):
-    DATA_COMPARISON_QUERY_TEMPLATE = """
+    _DATA_COMPARISON_QUERY_TEMPLATE = """
     WITH compare_results AS (
         SELECT 
             CASE 
@@ -73,7 +73,7 @@ def compare_data(
                 source_row_count=source_data_profile.row_count,
                 target_row_count=target_data_profile.row_count,
             )
-        comparison_query = StandardDataComparator.build_data_comparison_query(
+        comparison_query = self._build_data_comparison_query(
             source_data_profile,
             target_data_profile,
         )
@@ -89,7 +89,7 @@ def compare_data(
         )
 
     @classmethod
-    def build_data_comparison_query(
+    def _build_data_comparison_query(
         cls,
         source_data_profile: DataProfilingResult,
         target_data_profile: DataProfilingResult,
@@ -98,7 +98,7 @@ def build_data_comparison_query(
         target_table = target_data_profile.table_metadata.identifier
         source_hash_inputs = _build_data_comparison_hash_inputs(source_data_profile)
         target_hash_inputs = _build_data_comparison_hash_inputs(target_data_profile)
-        comparison_query = StandardDataComparator.DATA_COMPARISON_QUERY_TEMPLATE.format(
+        comparison_query = cls._DATA_COMPARISON_QUERY_TEMPLATE.format(
             source_hash_expr=f"SHA2(CONCAT_WS('|', {', '.join(source_hash_inputs)}), 256)",
             target_hash_expr=f"SHA2(CONCAT_WS('|', {', '.join(target_hash_inputs)}), 256)",
             source_table_fqn=source_table.fqn_escaped,
diff --git a/src/databricks/labs/ucx/recon/metadata_retriever.py b/src/databricks/labs/ucx/recon/metadata_retriever.py
@@ -18,7 +18,7 @@ def get_metadata(self, entity: TableIdentifier) -> TableMetadata:
         Note: This method does not handle exceptions raised during the execution of the SQL query. These exceptions are
         expected to be handled by the caller in a manner appropriate for their context.
         """
-        schema_query = DatabricksTableMetadataRetriever.build_metadata_query(entity)
+        schema_query = self._build_metadata_query(entity)
         query_result: Iterator[Row] = self._sql_backend.fetch(schema_query)
         # The code uses a set comprehension to automatically deduplicate the column metadata entries,
         # Partition information are typically prefixed with a # symbol,
@@ -32,7 +32,7 @@ def get_metadata(self, entity: TableIdentifier) -> TableMetadata:
         return TableMetadata(entity, sorted(columns, key=lambda x: x.name))
 
     @classmethod
-    def build_metadata_query(cls, entity: TableIdentifier) -> str:
+    def _build_metadata_query(cls, entity: TableIdentifier) -> str:
         if entity.catalog == "hive_metastore":
             return f"DESCRIBE TABLE {entity.fqn_escaped}"
 
diff --git a/tests/unit/recon/test_data_comparator.py b/tests/unit/recon/test_data_comparator.py
@@ -1,13 +1,8 @@
-import re
-
 from databricks.labs.lsql.backends import MockBackend
 
 from databricks.labs.ucx.recon.base import (
     TableIdentifier,
     DataComparisonResult,
-    DataProfilingResult,
-    TableMetadata,
-    ColumnMetadata,
 )
 from databricks.labs.ucx.recon.data_comparator import StandardDataComparator
 from databricks.labs.ucx.recon.data_profiler import StandardDataProfiler
@@ -22,10 +17,14 @@ def test_data_comparison(metadata_row_factory, row_count_row_factory, data_comp_
             f"{source.catalog}\\.information_schema\\.columns": metadata_row_factory[
                 ("col1", "int"),
                 ("col2", "string"),
+                ("col3", "array<string>"),
+                ("col4", "struct<a:int,b:int,c:array<string>>"),
             ],
             f"{target.catalog}\\.information_schema\\.columns": metadata_row_factory[
                 ("col1", "int"),
                 ("col2", "string"),
+                ("col3", "array<string>"),
+                ("col4", "struct<a:int,b:int,c:array<string>>"),
             ],
             f"SELECT COUNT\\(\\*\\) as row_count FROM {source.fqn_escaped}": row_count_row_factory[100,],
             f"SELECT COUNT\\(\\*\\) as row_count FROM {target.fqn_escaped}": row_count_row_factory[2,],
@@ -45,64 +44,3 @@ def test_data_comparison(metadata_row_factory, row_count_row_factory, data_comp_
     actual_comparison_result = data_comparator.compare_data(source, target, True)
 
     assert actual_comparison_result == expected_comparison_result
-
-
-def test_prepare_data_comparison_query():
-    source = TableIdentifier("hive_metastore", "db1", "table1")
-    target = TableIdentifier("catalog1", "schema1", "table2")
-
-    source_data_profile = DataProfilingResult(
-        10,
-        TableMetadata(
-            source,
-            [
-                ColumnMetadata("col1", "string"),
-                ColumnMetadata("col2", "array<string>"),
-                ColumnMetadata("col3", "struct<a:int,b:int,c:array<string>>"),
-            ],
-        ),
-    )
-    target_data_profile = DataProfilingResult(
-        10,
-        TableMetadata(
-            target,
-            [
-                ColumnMetadata("col1", "string"),
-                ColumnMetadata("col2", "array<string>"),
-                ColumnMetadata("col3", "struct<a:int,b:int,c:array<string>>"),
-            ],
-        ),
-    )
-
-    actual_query = (
-        StandardDataComparator.build_data_comparison_query(
-            source_data_profile,
-            target_data_profile,
-        )
-        .strip()
-        .lower()
-    )
-
-    source_hash_columns = [
-        "COALESCE(TRIM(col1), '')",
-        "COALESCE(TRIM(TO_JSON(SORT_ARRAY(col2))), '')",
-        "COALESCE(TRIM(TO_JSON(col3)), '')",
-    ]
-    target_hash_columns = [
-        "COALESCE(TRIM(col1), '')",
-        "COALESCE(TRIM(TO_JSON(SORT_ARRAY(col2))), '')",
-        "COALESCE(TRIM(TO_JSON(col3)), '')",
-    ]
-
-    expected_query = (
-        StandardDataComparator.DATA_COMPARISON_QUERY_TEMPLATE.format(
-            source_hash_expr=f"SHA2(CONCAT_WS('|', {', '.join(source_hash_columns)}), 256)",
-            target_hash_expr=f"SHA2(CONCAT_WS('|', {', '.join(target_hash_columns)}), 256)",
-            source_table_fqn="`hive_metastore`.`db1`.`table1`",
-            target_table_fqn="`catalog1`.`schema1`.`table2`",
-        )
-        .strip()
-        .lower()
-    )
-
-    assert re.sub(r'\s+', ' ', actual_query) == re.sub(r'\s+', ' ', expected_query)
diff --git a/tests/unit/recon/test_metadata_retriever.py b/tests/unit/recon/test_metadata_retriever.py
@@ -1,5 +1,3 @@
-import re
-
 from databricks.labs.lsql.backends import MockBackend
 
 from databricks.labs.ucx.recon.base import TableIdentifier, TableMetadata, ColumnMetadata
@@ -59,29 +57,3 @@ def test_unity_table_metadata_retrieval(metadata_row_factory):
     metadata_retriever = DatabricksTableMetadataRetriever(sql_backend)
     actual_metadata = metadata_retriever.get_metadata(table_identifier)
     assert actual_metadata == expected_metadata
-
-
-def test_hms_metadata_query():
-    table_identifier = TableIdentifier("hive_metastore", "db1", "table1")
-    actual_query = DatabricksTableMetadataRetriever.build_metadata_query(table_identifier).strip().lower()
-    expected_query = "DESCRIBE TABLE `hive_metastore`.`db1`.`table1`".lower()
-    assert re.sub(r'\s+', ' ', actual_query) == expected_query
-
-
-def test_unity_metadata_query():
-    table_identifier = TableIdentifier("catalog1", "db1", "table1")
-    actual_query = DatabricksTableMetadataRetriever.build_metadata_query(table_identifier).strip().lower()
-    expected_query = """
-        SELECT 
-            LOWER(column_name) AS col_name, 
-            full_data_type AS data_type
-        FROM 
-            `catalog1`.information_schema.columns
-        WHERE
-            LOWER(table_catalog)='catalog1' AND
-            LOWER(table_schema)='db1' AND
-            LOWER(table_name) ='table1'
-        ORDER BY col_name
-    """.strip().lower()
-
-    assert re.sub(r'\s+', ' ', actual_query) == re.sub(r'\s+', ' ', expected_query)