Skip to content

Commit f3f20ca

Browse files
author
Michiel De Smet
committed
Support sql check
1 parent f20c81b commit f3f20ca

File tree

12 files changed

+228
-69
lines changed

12 files changed

+228
-69
lines changed

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ def read(*names, **kwargs):
6767
"ruamel.yaml==0.18.6",
6868
"tabulate==0.9.0",
6969
"requests==2.31.0",
70+
"sqlglot",
7071
],
7172
extras_require={
7273
# eg:
Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,11 @@
11
from abc import abstractmethod
2-
from typing import Optional
32

4-
from datapilot.core.insights.base.insight import Insight
5-
from datapilot.schemas.sql import Dialect
3+
from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
64

75

8-
class SqlInsight(Insight):
6+
class SqlInsight(ChecksInsight):
97
NAME = "SqlInsight"
108

11-
def __init__(self, sql: str, dialect: Optional[Dialect], *args, **kwargs):
12-
self.sql = sql
13-
self.dialect = dialect
14-
super().__init__(*args, **kwargs)
15-
169
@abstractmethod
1710
def generate(self, *args, **kwargs) -> dict:
1811
pass

src/datapilot/core/platforms/dbt/executor.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def __init__(
5151
self.macros = self.manifest_wrapper.get_macros()
5252
self.sources = self.manifest_wrapper.get_sources()
5353
self.exposures = self.manifest_wrapper.get_exposures()
54+
self.adapter_type = self.manifest_wrapper.get_adapter_type()
5455
self.seeds = self.manifest_wrapper.get_seeds()
5556
self.children_map = self.manifest_wrapper.parent_to_child_map(self.nodes)
5657
self.tests = self.manifest_wrapper.get_tests()
@@ -112,6 +113,7 @@ def run(self):
112113
children_map=self.children_map,
113114
tests=self.tests,
114115
project_name=self.project_name,
116+
adapter_type=self.adapter_type,
115117
config=self.config,
116118
selected_models=self.selected_models,
117119
excluded_models=self.excluded_models,

src/datapilot/core/platforms/dbt/insights/__init__.py

Lines changed: 57 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -51,65 +51,67 @@
5151
from datapilot.core.platforms.dbt.insights.modelling.unused_sources import DBTUnusedSources
5252
from datapilot.core.platforms.dbt.insights.performance.chain_view_linking import DBTChainViewLinking
5353
from datapilot.core.platforms.dbt.insights.performance.exposure_parent_materializations import DBTExposureParentMaterialization
54+
from datapilot.core.platforms.dbt.insights.sql.sql_check import SqlCheck
5455
from datapilot.core.platforms.dbt.insights.structure.model_directories_structure import DBTModelDirectoryStructure
5556
from datapilot.core.platforms.dbt.insights.structure.model_naming_conventions import DBTModelNamingConvention
5657
from datapilot.core.platforms.dbt.insights.structure.source_directories_structure import DBTSourceDirectoryStructure
5758
from datapilot.core.platforms.dbt.insights.structure.test_directory_structure import DBTTestDirectoryStructure
5859

5960
INSIGHTS = [
60-
DBTDirectJoinSource,
61-
DBTDownstreamModelsDependentOnSource,
62-
DBTDuplicateSources,
63-
DBTModelFanout,
64-
DBTRootModel,
65-
DBTSourceFanout,
66-
DBTStagingModelsDependentOnDownstreamModels,
67-
DBTStagingModelsDependentOnStagingModels,
68-
DBTUnusedSources,
69-
DBTModelsMultipleSourcesJoined,
70-
DBTHardCodedReferences,
71-
DBTRejoiningOfUpstreamConcepts,
72-
DBTExposureDependentOnPrivateModels,
73-
DBTUndocumentedPublicModels,
74-
DBTPublicModelWithoutContracts,
75-
DBTChainViewLinking,
76-
DBTExposureParentMaterialization,
77-
DBTMissingDocumentation,
78-
DBTDocumentationStaleColumns,
79-
MissingPrimaryKeyTests,
80-
DBTTestCoverage,
81-
DBTModelDirectoryStructure,
82-
DBTModelNamingConvention,
83-
DBTSourceDirectoryStructure,
84-
DBTTestDirectoryStructure,
85-
CheckColumnDescAreSame,
86-
CheckColumnNameContract,
87-
CheckMacroArgsHaveDesc,
88-
CheckMacroHasDesc,
89-
CheckModelHasAllColumns,
90-
# CheckModelHasLabelsKeys,
91-
CheckModelHasMetaKeys,
92-
CheckModelHasPropertiesFile,
93-
CheckModelHasTestsByName,
94-
CheckModelHasTestsByType,
95-
CheckModelHasTestsByGroup,
96-
CheckModelMaterializationByChilds,
97-
CheckModelNameContract,
98-
CheckModelParentsAndChilds,
99-
CheckModelParentsDatabase,
100-
CheckModelParentsSchema,
101-
CheckModelTags,
102-
CheckSourceChilds,
103-
CheckSourceColumnsHaveDescriptions,
104-
CheckSourceHasAllColumns,
105-
CheckSourceHasFreshness,
106-
# CheckSourceHasLabelsKeys,
107-
CheckSourceHasLoader,
108-
CheckSourceHasMetaKeys,
109-
CheckSourceHasTestsByName,
110-
CheckSourceHasTestsByType,
111-
CheckSourceHasTestsByGroup,
112-
CheckSourceHasTests,
113-
CheckSourceTableHasDescription,
114-
CheckSourceTags,
61+
# DBTDirectJoinSource,
62+
# DBTDownstreamModelsDependentOnSource,
63+
# DBTDuplicateSources,
64+
# DBTModelFanout,
65+
# DBTRootModel,
66+
# DBTSourceFanout,
67+
# DBTStagingModelsDependentOnDownstreamModels,
68+
# DBTStagingModelsDependentOnStagingModels,
69+
# DBTUnusedSources,
70+
# DBTModelsMultipleSourcesJoined,
71+
# DBTHardCodedReferences,
72+
# DBTRejoiningOfUpstreamConcepts,
73+
# DBTExposureDependentOnPrivateModels,
74+
# DBTUndocumentedPublicModels,
75+
# DBTPublicModelWithoutContracts,
76+
# DBTChainViewLinking,
77+
# DBTExposureParentMaterialization,
78+
# DBTMissingDocumentation,
79+
# DBTDocumentationStaleColumns,
80+
# MissingPrimaryKeyTests,
81+
# DBTTestCoverage,
82+
# DBTModelDirectoryStructure,
83+
# DBTModelNamingConvention,
84+
# DBTSourceDirectoryStructure,
85+
# DBTTestDirectoryStructure,
86+
# CheckColumnDescAreSame,
87+
# CheckColumnNameContract,
88+
# CheckMacroArgsHaveDesc,
89+
# CheckMacroHasDesc,
90+
# CheckModelHasAllColumns,
91+
# # CheckModelHasLabelsKeys,
92+
# CheckModelHasMetaKeys,
93+
# CheckModelHasPropertiesFile,
94+
# CheckModelHasTestsByName,
95+
# CheckModelHasTestsByType,
96+
# CheckModelHasTestsByGroup,
97+
# CheckModelMaterializationByChilds,
98+
# CheckModelNameContract,
99+
# CheckModelParentsAndChilds,
100+
# CheckModelParentsDatabase,
101+
# CheckModelParentsSchema,
102+
# CheckModelTags,
103+
# CheckSourceChilds,
104+
# CheckSourceColumnsHaveDescriptions,
105+
# CheckSourceHasAllColumns,
106+
# CheckSourceHasFreshness,
107+
# # CheckSourceHasLabelsKeys,
108+
# CheckSourceHasLoader,
109+
# CheckSourceHasMetaKeys,
110+
# CheckSourceHasTestsByName,
111+
# CheckSourceHasTestsByType,
112+
# CheckSourceHasTestsByGroup,
113+
# CheckSourceHasTests,
114+
# CheckSourceTableHasDescription,
115+
# CheckSourceTags,
116+
SqlCheck,
115117
]

src/datapilot/core/platforms/dbt/insights/base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import abstractmethod
2-
from typing import ClassVar
2+
from typing import ClassVar, Optional
33
from typing import Dict
44
from typing import List
55
from typing import Union
@@ -33,6 +33,7 @@ def __init__(
3333
macros: Dict[str, AltimateManifestMacroNode],
3434
children_map: Dict[str, List[str]],
3535
project_name: str,
36+
adapter_type: Optional[str],
3637
selected_models: Union[List[str], None] = None,
3738
excluded_models: Union[List[str], None] = None,
3839
*args,
@@ -47,6 +48,7 @@ def __init__(
4748
self.seeds = seeds
4849
self.children_map = children_map
4950
self.project_name = project_name
51+
self.adapter_type = adapter_type
5052
self.selected_models = selected_models
5153
self.excluded_models = excluded_models
5254
super().__init__(*args, **kwargs)

src/datapilot/core/platforms/dbt/insights/sql/__init__.py

Whitespace-only changes.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from abc import abstractmethod
2+
from typing import Tuple
3+
4+
from datapilot.core.platforms.dbt.insights.base import DBTInsight
5+
6+
7+
class SqlInsight(DBTInsight):
8+
TYPE = "governance"
9+
10+
@abstractmethod
11+
def generate(self, *args, **kwargs) -> dict:
12+
pass
13+
14+
@classmethod
15+
def has_all_required_data(cls, has_manifest: bool, **kwargs) -> Tuple[bool, str]:
16+
"""
17+
Check if all required data is available for the insight to run.
18+
:param has_manifest: A boolean indicating if manifest is available.
19+
:return: A boolean indicating if all required data is available.
20+
"""
21+
if not has_manifest:
22+
return False, "manifest is required for insight to run."
23+
return True, ""
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import inspect
2+
from typing import List, Tuple
3+
4+
from sqlglot import parse_one
5+
from sqlglot.optimizer.eliminate_ctes import eliminate_ctes
6+
from sqlglot.optimizer.eliminate_joins import eliminate_joins
7+
from sqlglot.optimizer.eliminate_subqueries import eliminate_subqueries
8+
from sqlglot.optimizer.normalize import normalize
9+
from sqlglot.optimizer.pushdown_projections import pushdown_projections
10+
from sqlglot.optimizer.qualify import qualify
11+
from sqlglot.optimizer.unnest_subqueries import unnest_subqueries
12+
13+
from datapilot.core.insights.sql.base.insight import SqlInsight
14+
from datapilot.core.insights.utils import get_severity
15+
from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult, DBTModelInsightResponse
16+
17+
RULES = (
18+
pushdown_projections,
19+
normalize,
20+
unnest_subqueries,
21+
eliminate_subqueries,
22+
eliminate_joins,
23+
eliminate_ctes,
24+
)
25+
26+
class SqlCheck(SqlInsight):
27+
"""
28+
This class identifies DBT models with test coverage below a specified threshold.
29+
It aims to ensure that a minimum percentage of tests are applied to each model to maintain data integrity.
30+
"""
31+
32+
NAME = "sql optimization issues"
33+
ALIAS = "check_sql_optimization"
34+
DESCRIPTION = "Checks if the model has SQL optimization issues. "
35+
REASON_TO_FLAG = "The query can be optimized."
36+
FAILURE_MESSAGE = "The query for model `{model_unique_id}` has optimization opportunities:\n{rule_name}. "
37+
RECOMMENDATION = "Please adapt the query of the model `{model_unique_id}` as in following example:\n{optimized_sql}"
38+
39+
def _build_failure_result(self, model_unique_id: str, rule_name: str, optimized_sql: str) -> DBTInsightResult:
40+
"""
41+
Constructs a failure result for a given model with low test coverage.
42+
:param coverage: The calculated test coverage percentage for the model.
43+
:param min_coverage: The minimum required test coverage percentage.
44+
:return: An instance of DBTInsightResult containing failure details.
45+
"""
46+
failure_message = self.FAILURE_MESSAGE.format(model_unique_id=model_unique_id, rule_name=rule_name)
47+
recommendation = self.RECOMMENDATION.format(model_unique_id=model_unique_id, optimized_sql=optimized_sql)
48+
return DBTInsightResult(
49+
type=self.TYPE,
50+
name=self.NAME,
51+
message=failure_message,
52+
recommendation=recommendation,
53+
reason_to_flag=self.REASON_TO_FLAG,
54+
metadata={"model_unique_id": model_unique_id, "rule_name": rule_name},
55+
)
56+
57+
def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
58+
"""
59+
Generates insights for each DBT model in the project, focusing on test coverage.
60+
61+
:return: A list of DBTModelInsightResponse objects with insights for each model.
62+
"""
63+
self.logger.debug("Generating sql insights for DBT models")
64+
insights = []
65+
66+
possible_kwargs = {
67+
"db": None,
68+
"catalog": None,
69+
"dialect": self.adapter_type,
70+
"isolate_tables": True, # needed for other optimizations to perform well
71+
"quote_identifiers": False,
72+
**kwargs,
73+
}
74+
for node_id, node in self.nodes.items():
75+
try:
76+
compiled_query = node.compiled_code
77+
if compiled_query:
78+
parsed_query = parse_one(compiled_query, dialect=self.adapter_type)
79+
qualified = qualify(parsed_query, **possible_kwargs)
80+
changed = qualified.copy()
81+
for rule in RULES:
82+
original = changed.copy()
83+
rule_params = inspect.getfullargspec(rule).args
84+
rule_kwargs = {
85+
param: possible_kwargs[param]
86+
for param in rule_params
87+
if param in possible_kwargs
88+
}
89+
changed = rule(changed, **rule_kwargs)
90+
if changed.sql() != original.sql():
91+
insights.append(
92+
DBTModelInsightResponse(
93+
unique_id=node_id,
94+
package_name=node.package_name,
95+
path=node.original_file_path,
96+
original_file_path=node.original_file_path,
97+
insight=self._build_failure_result(
98+
node_id,
99+
rule.__name__,
100+
changed.sql()
101+
),
102+
severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
103+
)
104+
)
105+
except Exception as e:
106+
self.logger.error(e)
107+
return insights
108+
109+
def parse_query(
110+
query: str,
111+
dialect: str = "snowflake",
112+
):
113+
"""
114+
Parses the query and returns the parsed query object
115+
"""
116+
try:
117+
parsed = parse_one(query, read=dialect)
118+
except Exception as e:
119+
parsed = None
120+
return parsed

src/datapilot/core/platforms/dbt/wrappers/manifest/v10/wrapper.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Dict
1+
from typing import Dict, Optional
22
from typing import Set
33

44
from dbt_artifacts_parser.parsers.manifest.manifest_v10 import GenericTestNode
@@ -67,6 +67,7 @@ def _get_node(self, node: ManifestNode) -> AltimateManifestNode:
6767
depends_on_macros = node.depends_on.macros if node.depends_on else None
6868
compiled_path = node.compiled_path
6969
compiled = node.compiled
70+
compiled_code = node.compiled_code
7071
raw_code = node.raw_code
7172
language = node.language
7273
contract = AltimateDBTContract(**node.contract.__dict__) if node.contract else None
@@ -381,6 +382,9 @@ def get_seeds(self) -> Dict[str, AltimateSeedNode]:
381382
seeds[seed.unique_id] = self._get_seed(seed)
382383
return seeds
383384

385+
def get_adapter_type(self) -> Optional[str]:
386+
return self.manifest.metadata.adapter_type
387+
384388
def parent_to_child_map(self, nodes: Dict[str, AltimateManifestNode]) -> Dict[str, Set[str]]:
385389
"""
386390
Current manifest contains information about parents

src/datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Dict
1+
from typing import Dict, Optional
22
from typing import Set
33

44
from dbt_artifacts_parser.parsers.manifest.manifest_v11 import GenericTestNode
@@ -67,6 +67,7 @@ def _get_node(self, node: ManifestNode) -> AltimateManifestNode:
6767
depends_on_macros = node.depends_on.macros if node.depends_on else None
6868
compiled_path = node.compiled_path
6969
compiled = node.compiled
70+
compiled_code = node.compiled_code
7071
raw_code = node.raw_code
7172
language = node.language
7273
contract = AltimateDBTContract(**node.contract.__dict__) if node.contract else None
@@ -381,6 +382,9 @@ def get_seeds(self) -> Dict[str, AltimateSeedNode]:
381382
seeds[seed.unique_id] = self._get_seed(seed)
382383
return seeds
383384

385+
def get_adapter_type(self) -> Optional[str]:
386+
return self.manifest.metadata.adapter_type
387+
384388
def parent_to_child_map(self, nodes: Dict[str, AltimateManifestNode]) -> Dict[str, Set[str]]:
385389
"""
386390
Current manifest contains information about parents

0 commit comments

Comments
 (0)