@@ -2,108 +2,18 @@
 import copy
 import re
 from typing import Optional, List
-
 from discoverx import logging
 from discoverx.common import helper
 from discoverx.discovery import Discovery
 from discoverx.rules import Rule
-from discoverx.scanner import ColumnInfo, TableInfo
 from functools import reduce
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import lit
-from pyspark.sql.types import Row
-
-logger = logging.Logging()
-
-
-class InfoFetcher:
-    def __init__(self, spark, columns_table_name="system.information_schema.columns") -> None:
-        self.columns_table_name = columns_table_name
-        self.spark = spark
-
-    def _to_info_list(self, info_rows: list[Row]) -> list[TableInfo]:
-        filtered_tables = [
-            TableInfo(
-                row["table_catalog"],
-                row["table_schema"],
-                row["table_name"],
-                [
-                    ColumnInfo(col["column_name"], col["data_type"], col["partition_index"], [])
-                    for col in row["table_columns"]
-                ],
-            )
-            for row in info_rows
-        ]
-        return filtered_tables
-
-    def get_tables_info(self, catalogs: str, schemas: str, tables: str, columns: list[str] = []) -> list[TableInfo]:
-        # Filter tables by matching filter
-        table_list_sql = self._get_table_list_sql(catalogs, schemas, tables, columns)
-
-        filtered_tables = self.spark.sql(table_list_sql).collect()
-
-        if len(filtered_tables) == 0:
-            raise ValueError(f"No tables found matching filter: {catalogs}.{schemas}.{tables}")
-
-        return self._to_info_list(filtered_tables)
-
-    def _get_table_list_sql(self, catalogs: str, schemas: str, tables: str, columns: list[str] = []) -> str:
-        """
-        Returns a SQL expression which returns a list of columns matching
-        the specified filters
-
-        Returns:
-            string: The SQL expression
-        """
-
-        if "*" in catalogs:
-            catalog_sql = f"""AND regexp_like(table_catalog, "^{catalogs.replace("*", ".*")}$")"""
-        else:
-            catalog_sql = f"""AND table_catalog = "{catalogs}" """
-
-        if "*" in schemas:
-            schema_sql = f"""AND regexp_like(table_schema, "^{schemas.replace("*", ".*")}$")"""
-        else:
-            schema_sql = f"""AND table_schema = "{schemas}" """
 
-        if "*" in tables:
-            table_sql = f"""AND regexp_like(table_name, "^{tables.replace("*", ".*")}$")"""
-        else:
-            table_sql = f"""AND table_name = "{tables}" """
-
-        if columns:
-            match_any_col = "|".join([f'({c.replace("*", ".*")})' for c in columns])
-            columns_sql = f"""AND regexp_like(column_name, "^{match_any_col}$")"""
-
-        sql = f"""
-        WITH tb_list AS (
-            SELECT DISTINCT
-                table_catalog,
-                table_schema,
-                table_name
-            FROM {self.columns_table_name}
-            WHERE
-                table_schema != "information_schema"
-                {catalog_sql if catalogs != "*" else ""}
-                {schema_sql if schemas != "*" else ""}
-                {table_sql if tables != "*" else ""}
-                {columns_sql if columns else ""}
-        )
+from discoverx.table_info import InfoFetcher, TableInfo
 
-        SELECT
-            info_schema.table_catalog,
-            info_schema.table_schema,
-            info_schema.table_name,
-            collect_list(struct(column_name, data_type, partition_index)) as table_columns
-        FROM {self.columns_table_name} info_schema
-        INNER JOIN tb_list ON (
-            info_schema.table_catalog <=> tb_list.table_catalog AND
-            info_schema.table_schema = tb_list.table_schema AND
-            info_schema.table_name = tb_list.table_name)
-        GROUP BY info_schema.table_catalog, info_schema.table_schema, info_schema.table_name
-        """
 
-        return helper.strip_margin(sql)
+logger = logging.Logging()
 
 
 class DataExplorer:
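Taken together, the hunk removes the local InfoFetcher definition (and the ColumnInfo/TableInfo import from discoverx.scanner) and instead imports InfoFetcher and TableInfo from discoverx.table_info. As a rough sketch of what a caller of the relocated class could look like, assuming the constructor and get_tables_info signatures shown in the removed code carry over unchanged to the new module (the Spark session and the "dev_*" catalog filter below are placeholders, not part of the commit):

    # Hedged sketch, not part of the commit: driving the relocated InfoFetcher.
    from pyspark.sql import SparkSession

    from discoverx.table_info import InfoFetcher, TableInfo  # new home per the added import

    spark = SparkSession.builder.getOrCreate()

    # Defaults to system.information_schema.columns, as in the removed __init__.
    fetcher = InfoFetcher(spark)

    # "*" wildcards are expanded into regexp_like filters by _get_table_list_sql;
    # exact names fall back to plain equality. Raises ValueError if nothing matches.
    table_infos: list[TableInfo] = fetcher.get_tables_info("dev_*", "*", "*")
    print(f"Matched {len(table_infos)} tables")

If the class body was moved verbatim, DataExplorer's behaviour should be unaffected by this commit; only the import path changes.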