1- import re
2-
31from databricks .labs .lsql .backends import MockBackend
42
53from databricks .labs .ucx .recon .base import (
64 TableIdentifier ,
75 DataComparisonResult ,
8- DataProfilingResult ,
9- TableMetadata ,
10- ColumnMetadata ,
116)
127from databricks .labs .ucx .recon .data_comparator import StandardDataComparator
138from databricks .labs .ucx .recon .data_profiler import StandardDataProfiler
@@ -22,10 +17,14 @@ def test_data_comparison(metadata_row_factory, row_count_row_factory, data_comp_
2217 f"{ source .catalog } \\ .information_schema\\ .columns" : metadata_row_factory [
2318 ("col1" , "int" ),
2419 ("col2" , "string" ),
20+ ("col3" , "array<string>" ),
21+ ("col4" , "struct<a:int,b:int,c:array<string>>" ),
2522 ],
2623 f"{ target .catalog } \\ .information_schema\\ .columns" : metadata_row_factory [
2724 ("col1" , "int" ),
2825 ("col2" , "string" ),
26+ ("col3" , "array<string>" ),
27+ ("col4" , "struct<a:int,b:int,c:array<string>>" ),
2928 ],
3029 f"SELECT COUNT\\ (\\ *\\ ) as row_count FROM { source .fqn_escaped } " : row_count_row_factory [100 ,],
3130 f"SELECT COUNT\\ (\\ *\\ ) as row_count FROM { target .fqn_escaped } " : row_count_row_factory [2 ,],
@@ -45,64 +44,3 @@ def test_data_comparison(metadata_row_factory, row_count_row_factory, data_comp_
4544 actual_comparison_result = data_comparator .compare_data (source , target , True )
4645
4746 assert actual_comparison_result == expected_comparison_result
48-
49-
def test_prepare_data_comparison_query():
    """Verify the SQL produced by ``StandardDataComparator.build_data_comparison_query``.

    Source and target share the same three-column layout (a string, an array
    of strings, and a struct). The generated query is compared against
    ``DATA_COMPARISON_QUERY_TEMPLATE`` filled in with the expected per-column
    hash expressions; case and whitespace differences are ignored.
    """

    def make_profile(table):
        # Same row count and column layout for whichever table is passed in.
        columns = [
            ColumnMetadata("col1", "string"),
            ColumnMetadata("col2", "array<string>"),
            ColumnMetadata("col3", "struct<a:int,b:int,c:array<string>>"),
        ]
        return DataProfilingResult(10, TableMetadata(table, columns))

    def normalise(sql):
        # Trim, lower-case, then collapse every whitespace run to one space.
        return re.sub(r'\s+', ' ', sql.strip().lower())

    source = TableIdentifier("hive_metastore", "db1", "table1")
    target = TableIdentifier("catalog1", "schema1", "table2")

    generated_sql = StandardDataComparator.build_data_comparison_query(
        make_profile(source),
        make_profile(target),
    )

    # Expected per-column expressions; both sides have identical columns,
    # so the same list feeds the source and the target hash expression.
    hash_columns = [
        "COALESCE(TRIM(col1), '')",
        "COALESCE(TRIM(TO_JSON(SORT_ARRAY(col2))), '')",
        "COALESCE(TRIM(TO_JSON(col3)), '')",
    ]
    joined_columns = ', '.join(hash_columns)
    source_hash_expr = f"SHA2(CONCAT_WS('|', {joined_columns}), 256)"
    target_hash_expr = f"SHA2(CONCAT_WS('|', {joined_columns}), 256)"

    template_sql = StandardDataComparator.DATA_COMPARISON_QUERY_TEMPLATE.format(
        source_hash_expr=source_hash_expr,
        target_hash_expr=target_hash_expr,
        source_table_fqn="`hive_metastore`.`db1`.`table1`",
        target_table_fqn="`catalog1`.`schema1`.`table2`",
    )

    assert normalise(generated_sql) == normalise(template_sql)
0 commit comments