66from discoverx .common import helper
77from discoverx .discovery import Discovery
88from discoverx .rules import Rule
9+ from discoverx .table_info import TagsInfo , ColumnTagInfo , TagInfo , ColumnInfo , TableInfo
910from functools import reduce
1011from pyspark .sql import DataFrame , SparkSession
1112from pyspark .sql .functions import lit
12-
1313from discoverx .table_info import InfoFetcher , TableInfo
1414
1515
@@ -21,12 +21,17 @@ class DataExplorer:
2121
def __init__(self, from_tables, spark: SparkSession, info_fetcher: InfoFetcher) -> None:
    """Set up an explorer over the tables selected by ``from_tables``.

    Args:
        from_tables: Table selection expression. It is stored as given and
            also handed to ``DataExplorer.validate_from_components``, whose
            three results become the catalog, schema and table components.
        spark: Spark session kept on the instance.
        info_fetcher: Metadata fetcher kept on the instance.
    """
    self._from_tables = from_tables

    # Validation yields exactly three components, unpacked in this order.
    components = DataExplorer.validate_from_components(from_tables)
    self._catalogs, self._schemas, self._tables = components

    # Collaborators are stored as passed in.
    self._spark = spark
    self._info_fetcher = info_fetcher

    # Defaults for the optional settings: no column filter, no SQL template,
    # ten concurrent workers, tag collection off.
    self._having_columns = []
    self._sql_query_template = None
    self._max_concurrency = 10
    self._with_tags = False
3035
3136 @staticmethod
3237 def validate_from_components (from_tables : str ):
@@ -48,6 +53,7 @@ def __deepcopy__(self, memo):
4853 new_obj ._having_columns = copy .deepcopy (self ._having_columns )
4954 new_obj ._sql_query_template = copy .deepcopy (self ._sql_query_template )
5055 new_obj ._max_concurrency = copy .deepcopy (self ._max_concurrency )
56+ new_obj ._with_tags = copy .deepcopy (self ._with_tags )
5157
5258 new_obj ._spark = self ._spark
5359 new_obj ._info_fetcher = self ._info_fetcher
@@ -70,6 +76,12 @@ def with_concurrency(self, max_concurrency) -> "DataExplorer":
7076 new_obj ._max_concurrency = max_concurrency
7177 return new_obj
7278
def with_tags(self, use_tags=True) -> "DataExplorer":
    """Return a copy of this explorer with tag collection switched on or off.

    The receiver is left unchanged; the flag is set on a deep copy.

    Args:
        use_tags: Whether tags should be collected when getting table
            metadata. Defaults to True.

    Returns:
        DataExplorer: The copied explorer carrying the requested setting.
    """
    clone = copy.deepcopy(self)
    clone._with_tags = use_tags
    return clone
84+
7385 def with_sql (self , sql_query_template : str ) -> "DataExplorerActions" :
7486 """Sets the SQL query template to use for the data exploration
7587
@@ -135,10 +147,44 @@ def scan(
135147 discover .scan (rules = rules , sample_size = sample_size , what_if = what_if )
136148 return discover
137149
def map(self, f) -> list[any]:
    """Apply a function to every table selected by this data explorer.

    The table list is obtained from the info fetcher using the explorer's
    catalog, schema, table and column filters plus its tag setting. Each
    entry is then passed to ``f`` on a thread pool limited to
    ``self._max_concurrency`` workers.

    Args:
        f (function): The function to run. It should accept a TableInfo object as input and may return any object.

    Returns:
        list[any]: The values returned by ``f``, in the order the calls
            complete (not the order of the table list). ``None`` results
            are left out.
    """
    tables = self._info_fetcher.get_tables_info(
        self._catalogs,
        self._schemas,
        self._tables,
        self._having_columns,
        self._with_tags,
    )

    with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_concurrency) as pool:
        # Queue one call of f per table.
        pending = [pool.submit(f, table) for table in tables]

        # Gather results as calls finish; result() re-raises any exception
        # raised inside f.
        finished = concurrent.futures.as_completed(pending)
        outcomes = (task.result() for task in finished)
        collected = [value for value in outcomes if value is not None]

    logger.debug("Finished lakehouse map task")

    return collected
180+
138181
139182class DataExplorerActions :
140183 def __init__ (
141- self , data_explorer : DataExplorer , spark : SparkSession = None , info_fetcher : InfoFetcher = None
184+ self ,
185+ data_explorer : DataExplorer ,
186+ spark : SparkSession = None ,
187+ info_fetcher : InfoFetcher = None ,
142188 ) -> None :
143189 self ._data_explorer = data_explorer
144190 if spark is None :
@@ -193,10 +239,18 @@ def _get_sql_commands(self, data_explorer: DataExplorer) -> list[tuple[str, Tabl
193239 logger .debug ("Launching lakehouse scanning task\n " )
194240
195241 table_list = self ._info_fetcher .get_tables_info (
196- data_explorer ._catalogs , data_explorer ._schemas , data_explorer ._tables , data_explorer ._having_columns
242+ data_explorer ._catalogs ,
243+ data_explorer ._schemas ,
244+ data_explorer ._tables ,
245+ data_explorer ._having_columns ,
246+ data_explorer ._with_tags ,
197247 )
198248 sql_commands = [
199- (DataExplorerActions ._build_sql (data_explorer ._sql_query_template , table ), table ) for table in table_list
249+ (
250+ DataExplorerActions ._build_sql (data_explorer ._sql_query_template , table ),
251+ table ,
252+ )
253+ for table in table_list
200254 ]
201255 return sql_commands
202256
0 commit comments