
Commit 895722a

Tag info metadata (#74)
* Improved explorer filter performance
* Added tag info in query
* Added table tags info
* Added map function
* Fixed information schema refactor
* Fixed map and added example
* Fixed some of the unit tests
* Fixed unit tests
* Improved test coverage
* Improved test coverage
* Added all types of tags on query
* Fixed unit test
* Added with_tags to explorer
* Fixed tags info
* Deduplicated TableInfo
* Fixed discovery class
* black reformat
* Added table info map example
* Added docs for map function and refactored
* Fixed docstring
* black reformat
1 parent: e78d16c

19 files changed: +765 −99 lines

discoverx/discovery.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -13,7 +13,7 @@
 class Discovery:
     """ """
 
-    COLUMNS_TABLE_NAME = "system.information_schema.columns"
+    INFORMATION_SCHEMA = "system.information_schema"
     MAX_WORKERS = 10
 
     def __init__(
@@ -73,7 +73,7 @@ def scan(
             rule_filter=rules,
             sample_size=sample_size,
             what_if=what_if,
-            columns_table_name=self.COLUMNS_TABLE_NAME,
+            information_schema=self.INFORMATION_SCHEMA,
             max_workers=self.MAX_WORKERS,
         )
 
@@ -229,7 +229,8 @@ def select_by_classes(
         )
 
         return self._msql(
-            f"SELECT {from_statement}, to_json(struct(*)) AS row_content FROM {from_tables}", min_score=min_score
+            f"SELECT {from_statement}, to_json(struct(*)) AS row_content FROM {from_tables}",
+            min_score=min_score,
         )
 
     def delete_by_class(
```
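The rename above swaps a hard-coded columns table for the information-schema root, so call sites now hold the root and append whichever metadata table they need (dx.py and scanner.py below do this with `.columns`). A minimal sketch of that convention; the tag-table name is an assumption about the Unity Catalog information schema and does not appear in these hunks:

```python
# Illustrative sketch only, not part of the diff.
INFORMATION_SCHEMA = "system.information_schema"

# Before: a single hard-coded columns table.
columns_table_old = "system.information_schema.columns"

# After: derive each metadata table from the shared root, leaving room for
# the tag tables this commit starts querying (assumed name below).
columns_table = f"{INFORMATION_SCHEMA}.columns"
table_tags_table = f"{INFORMATION_SCHEMA}.table_tags"  # assumption

assert columns_table == columns_table_old
```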

discoverx/dx.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -25,7 +25,7 @@ class DX:
         Defaults to None.
     """
 
-    COLUMNS_TABLE_NAME = "system.information_schema.columns"
+    INFORMATION_SCHEMA = "system.information_schema"
     MAX_WORKERS = 10
 
     def __init__(
@@ -49,10 +49,10 @@ def __init__(
 
     def _can_read_columns_table(self) -> bool:
         try:
-            self.spark.sql(f"SELECT * FROM {self.COLUMNS_TABLE_NAME} WHERE table_catalog = 'system' LIMIT 1")
+            self.spark.sql(f"SELECT * FROM {self.INFORMATION_SCHEMA}.columns WHERE table_catalog = 'system' LIMIT 1")
             return True
         except Exception as e:
-            self.logger.error(f"Error while reading table {self.COLUMNS_TABLE_NAME}: {e}")
+            self.logger.error(f"Error while reading table {self.INFORMATION_SCHEMA}.columns: {e}")
             return False
 
     def intro(self):
@@ -137,7 +137,7 @@ def scan(
             rule_filter=rules,
             sample_size=sample_size,
             what_if=what_if,
-            columns_table_name=self.COLUMNS_TABLE_NAME,
+            information_schema=self.INFORMATION_SCHEMA,
             max_workers=self.MAX_WORKERS,
         )
 
@@ -400,7 +400,7 @@ def from_tables(self, from_tables: str = "*.*.*"):
 
         """
 
-        return DataExplorer(from_tables, self.spark, InfoFetcher(self.spark, self.COLUMNS_TABLE_NAME))
+        return DataExplorer(from_tables, self.spark, InfoFetcher(self.spark, self.INFORMATION_SCHEMA))
 
     def _msql(self, msql: str, what_if: bool = False, min_score: Optional[float] = None):
         self.logger.debug(f"Executing sql template: {msql}")
```

discoverx/explorer.py

Lines changed: 59 additions & 5 deletions
```diff
@@ -6,10 +6,10 @@
 from discoverx.common import helper
 from discoverx.discovery import Discovery
 from discoverx.rules import Rule
+from discoverx.table_info import TagsInfo, ColumnTagInfo, TagInfo, ColumnInfo, TableInfo
 from functools import reduce
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import lit
-
 from discoverx.table_info import InfoFetcher, TableInfo
 
 
@@ -21,12 +21,17 @@ class DataExplorer:
 
     def __init__(self, from_tables, spark: SparkSession, info_fetcher: InfoFetcher) -> None:
         self._from_tables = from_tables
-        self._catalogs, self._schemas, self._tables = DataExplorer.validate_from_components(from_tables)
+        (
+            self._catalogs,
+            self._schemas,
+            self._tables,
+        ) = DataExplorer.validate_from_components(from_tables)
         self._spark = spark
         self._info_fetcher = info_fetcher
         self._having_columns = []
         self._sql_query_template = None
         self._max_concurrency = 10
+        self._with_tags = False
 
     @staticmethod
     def validate_from_components(from_tables: str):
@@ -48,6 +53,7 @@ def __deepcopy__(self, memo):
         new_obj._having_columns = copy.deepcopy(self._having_columns)
         new_obj._sql_query_template = copy.deepcopy(self._sql_query_template)
         new_obj._max_concurrency = copy.deepcopy(self._max_concurrency)
+        new_obj._with_tags = copy.deepcopy(self._with_tags)
 
         new_obj._spark = self._spark
         new_obj._info_fetcher = self._info_fetcher
@@ -70,6 +76,12 @@ def with_concurrency(self, max_concurrency) -> "DataExplorer":
         new_obj._max_concurrency = max_concurrency
         return new_obj
 
+    def with_tags(self, use_tags=True) -> "DataExplorer":
+        """Defines if tags should be collected when getting table metadata"""
+        new_obj = copy.deepcopy(self)
+        new_obj._with_tags = use_tags
+        return new_obj
+
     def with_sql(self, sql_query_template: str) -> "DataExplorerActions":
         """Sets the SQL query template to use for the data exploration
 
@@ -135,10 +147,44 @@ def scan(
         discover.scan(rules=rules, sample_size=sample_size, what_if=what_if)
         return discover
 
+    def map(self, f) -> list[any]:
+        """Runs a function for each table in the data explorer
+
+        Args:
+            f (function): The function to run. The function should accept a TableInfo object as input and return any object as output.
+
+        Returns:
+            list[any]: A list of the results of running the function for each table
+        """
+        res = []
+        table_list = self._info_fetcher.get_tables_info(
+            self._catalogs,
+            self._schemas,
+            self._tables,
+            self._having_columns,
+            self._with_tags,
+        )
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_concurrency) as executor:
+            # Submit tasks to the thread pool
+            futures = [executor.submit(f, table_info) for table_info in table_list]
+
+            # Process completed tasks
+            for future in concurrent.futures.as_completed(futures):
+                result = future.result()
+                if result is not None:
+                    res.append(result)
+
+        logger.debug("Finished lakehouse map task")
+
+        return res
+
 
 class DataExplorerActions:
     def __init__(
-        self, data_explorer: DataExplorer, spark: SparkSession = None, info_fetcher: InfoFetcher = None
+        self,
+        data_explorer: DataExplorer,
+        spark: SparkSession = None,
+        info_fetcher: InfoFetcher = None,
     ) -> None:
         self._data_explorer = data_explorer
         if spark is None:
@@ -193,10 +239,18 @@ def _get_sql_commands(self, data_explorer: DataExplorer) -> list[tuple[str, Tabl
         logger.debug("Launching lakehouse scanning task\n")
 
         table_list = self._info_fetcher.get_tables_info(
-            data_explorer._catalogs, data_explorer._schemas, data_explorer._tables, data_explorer._having_columns
+            data_explorer._catalogs,
+            data_explorer._schemas,
+            data_explorer._tables,
+            data_explorer._having_columns,
+            data_explorer._with_tags,
         )
         sql_commands = [
-            (DataExplorerActions._build_sql(data_explorer._sql_query_template, table), table) for table in table_list
+            (
+                DataExplorerActions._build_sql(data_explorer._sql_query_template, table),
+                table,
+            )
+            for table in table_list
        ]
         return sql_commands
 
```
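With `with_tags()` and `map()` in place, a per-table function can be fanned out across the lakehouse, with tags fetched into each `TableInfo` on demand. A hedged usage sketch: the `DX` entry point, `from_tables`, and `with_concurrency` come from this repo, while the example filter and the tag field names (`tags.table_tags`, `tag_name`) are assumptions about `table_info.py` rather than code shown in this commit:

```python
# Hypothetical usage sketch of the new with_tags()/map() chain.
from discoverx import DX

dx = DX()

def summarize(table_info):
    # table_info is a TableInfo; return any value, or None to drop the table
    # from the results. The .tags structure assumed here is TagsInfo.table_tags
    # with TagInfo.tag_name entries (assumption).
    tag_names = [t.tag_name for t in table_info.tags.table_tags] if table_info.tags else []
    return (f"{table_info.catalog}.{table_info.schema}.{table_info.table}", tag_names)

# Tags are collected only when with_tags(True) is set; map() runs summarize()
# over a thread pool bounded by with_concurrency().
results = (
    dx.from_tables("prod_catalog.*.*")  # hypothetical filter
    .with_tags(True)
    .with_concurrency(5)
    .map(summarize)
)
```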

discoverx/msql.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -97,6 +97,7 @@ def build(self, classified_result_pdf) -> list[SQLRow]:
                 row[1],
                 row[2],
                 [ColumnInfo(col[0], "", None, col[1]) for col in row[3]],  # col name  # TODO  # TODO  # Classes
+                None,
             )
             for _, row in df.iterrows()
             if fnmatch(row[0], self.catalogs) and fnmatch(row[1], self.schemas) and fnmatch(row[2], self.tables)
```
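The extra `None` fills a new trailing tags argument on `TableInfo` (the commit message notes TableInfo was deduplicated into `table_info.py`). A rough sketch of the presumed shape, inferred from the imports added in explorer.py and the positional `None` passed here and in scanner.py; field names are assumptions, not the actual definitions:

```python
# Presumed shape of the tag-aware metadata containers (assumption, not the
# real definitions in discoverx/table_info.py).
from dataclasses import dataclass
from typing import Optional

@dataclass
class TagInfo:
    tag_name: str
    tag_value: str

@dataclass
class ColumnTagInfo:
    column_name: str
    tag_name: str
    tag_value: str

@dataclass
class TagsInfo:
    column_tags: Optional[list[ColumnTagInfo]]
    table_tags: Optional[list[TagInfo]]

@dataclass
class TableInfo:
    catalog: Optional[str]
    schema: str
    table: str
    columns: list          # list[ColumnInfo]
    tags: Optional[TagsInfo]  # new in this commit; msql.py and scanner.py pass None
```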

discoverx/scanner.py

Lines changed: 53 additions & 4 deletions
```diff
@@ -9,7 +9,7 @@
 
 from discoverx.common.helper import strip_margin, format_regex
 from discoverx import logging
-from discoverx.table_info import InfoFetcher, TableInfo
+from discoverx.table_info import InfoFetcher, TableInfo, ColumnInfo
 from discoverx.rules import Rules, RuleTypes
 
 logger = logging.Logging()
@@ -138,7 +138,7 @@ def __init__(
         rule_filter: str = "*",
         sample_size: int = 1000,
         what_if: bool = False,
-        columns_table_name: str = "",
+        information_schema: str = "",
         max_workers: int = 10,
     ):
         self.spark = spark
@@ -150,18 +150,67 @@ def __init__(
         self.rules_filter = rule_filter
         self.sample_size = sample_size
         self.what_if = what_if
-        self.columns_table_name = columns_table_name
+        self.information_schema = information_schema
         self.max_workers = max_workers
 
         self.content: ScanContent = self._resolve_scan_content()
         self.rule_list = self.rules.get_rules(rule_filter=self.rules_filter)
         self.scan_result: Optional[ScanResult] = None
 
+    def _get_list_of_tables(self) -> List[TableInfo]:
+        table_list_sql = self._get_table_list_sql()
+
+        rows = self.spark.sql(table_list_sql).collect()
+        filtered_tables = [
+            TableInfo(
+                row["table_catalog"],
+                row["table_schema"],
+                row["table_name"],
+                [
+                    ColumnInfo(col["column_name"], col["data_type"], col["partition_index"], [])
+                    for col in row["table_columns"]
+                ],
+                None,
+            )
+            for row in rows
+        ]
+        return filtered_tables
+
+    def _get_table_list_sql(self):
+        """
+        Returns a SQL expression which returns a list of columns matching
+        the specified filters
+
+        Returns:
+            string: The SQL expression
+        """
+
+        catalog_sql = f"""AND regexp_like(table_catalog, "^{self.catalogs.replace("*", ".*")}$")"""
+        schema_sql = f"""AND regexp_like(table_schema, "^{self.schemas.replace("*", ".*")}$")"""
+        table_sql = f"""AND regexp_like(table_name, "^{self.tables.replace("*", ".*")}$")"""
+
+        sql = f"""
+        SELECT
+            table_catalog,
+            table_schema,
+            table_name,
+            collect_list(struct(column_name, data_type, partition_index)) as table_columns
+        FROM {self.information_schema}.columns
+        WHERE
+            table_schema != "information_schema"
+            {catalog_sql if self.catalogs != "*" else ""}
+            {schema_sql if self.schemas != "*" else ""}
+            {table_sql if self.tables != "*" else ""}
+        GROUP BY table_catalog, table_schema, table_name
+        """
+
+        return strip_margin(sql)
+
     def _resolve_scan_content(self) -> ScanContent:
         if self.table_list:
             table_list = self.table_list
         else:
-            info_fetcher = InfoFetcher(self.spark, columns_table_name=self.columns_table_name)
+            info_fetcher = InfoFetcher(self.spark, information_schema=self.information_schema)
             table_list = info_fetcher.get_tables_info(self.catalogs, self.schemas, self.tables)
         catalogs = set(map(lambda x: x.catalog, table_list))
         schemas = set(map(lambda x: f"{x.catalog}.{x.schema}", table_list))
```
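To make the generated metadata query concrete: with hypothetical filters `catalogs="prod*"`, `schemas="*"`, `tables="*"`, `_get_table_list_sql` turns the `*` wildcard into a regex and emits only the catalog predicate, producing roughly the statement below. Each resulting row carries one table with its columns aggregated into `table_columns`, which `_get_list_of_tables` unpacks into `TableInfo`/`ColumnInfo` objects.

```python
# Approximate output of _get_table_list_sql for the assumed filter values
# (whitespace trimmed for readability).
expected_sql = """
SELECT
    table_catalog,
    table_schema,
    table_name,
    collect_list(struct(column_name, data_type, partition_index)) as table_columns
FROM system.information_schema.columns
WHERE
    table_schema != "information_schema"
    AND regexp_like(table_catalog, "^prod.*$")
GROUP BY table_catalog, table_schema, table_name
"""
```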
