Support sql check

Michiel De Smet · Michiel De Smet · commit f3f20cad93a0 · 2024-11-14T14:32:48.000+08:00
diff --git a/setup.py b/setup.py
@@ -67,6 +67,7 @@ def read(*names, **kwargs):
         "ruamel.yaml==0.18.6",
         "tabulate==0.9.0",
         "requests==2.31.0",
+        "sqlglot",
     ],
     extras_require={
         # eg:
diff --git a/src/datapilot/core/insights/sql/base/insight.py b/src/datapilot/core/insights/sql/base/insight.py
@@ -1,18 +1,11 @@
 from abc import abstractmethod
-from typing import Optional
 
-from datapilot.core.insights.base.insight import Insight
-from datapilot.schemas.sql import Dialect
+from datapilot.core.platforms.dbt.insights.checks.base import ChecksInsight
 
 
-class SqlInsight(Insight):
+class SqlInsight(ChecksInsight):
     NAME = "SqlInsight"
 
-    def __init__(self, sql: str, dialect: Optional[Dialect], *args, **kwargs):
-        self.sql = sql
-        self.dialect = dialect
-        super().__init__(*args, **kwargs)
-
     @abstractmethod
     def generate(self, *args, **kwargs) -> dict:
         pass
diff --git a/src/datapilot/core/platforms/dbt/executor.py b/src/datapilot/core/platforms/dbt/executor.py
@@ -51,6 +51,7 @@ def __init__(
         self.macros = self.manifest_wrapper.get_macros()
         self.sources = self.manifest_wrapper.get_sources()
         self.exposures = self.manifest_wrapper.get_exposures()
+        self.adapter_type = self.manifest_wrapper.get_adapter_type()
         self.seeds = self.manifest_wrapper.get_seeds()
         self.children_map = self.manifest_wrapper.parent_to_child_map(self.nodes)
         self.tests = self.manifest_wrapper.get_tests()
@@ -112,6 +113,7 @@ def run(self):
                     children_map=self.children_map,
                     tests=self.tests,
                     project_name=self.project_name,
+                    adapter_type=self.adapter_type,
                     config=self.config,
                     selected_models=self.selected_models,
                     excluded_models=self.excluded_models,
diff --git a/src/datapilot/core/platforms/dbt/insights/__init__.py b/src/datapilot/core/platforms/dbt/insights/__init__.py
@@ -51,65 +51,67 @@
 from datapilot.core.platforms.dbt.insights.modelling.unused_sources import DBTUnusedSources
 from datapilot.core.platforms.dbt.insights.performance.chain_view_linking import DBTChainViewLinking
 from datapilot.core.platforms.dbt.insights.performance.exposure_parent_materializations import DBTExposureParentMaterialization
+from datapilot.core.platforms.dbt.insights.sql.sql_check import SqlCheck
 from datapilot.core.platforms.dbt.insights.structure.model_directories_structure import DBTModelDirectoryStructure
 from datapilot.core.platforms.dbt.insights.structure.model_naming_conventions import DBTModelNamingConvention
 from datapilot.core.platforms.dbt.insights.structure.source_directories_structure import DBTSourceDirectoryStructure
 from datapilot.core.platforms.dbt.insights.structure.test_directory_structure import DBTTestDirectoryStructure
 
 INSIGHTS = [
     DBTDirectJoinSource,
     DBTDownstreamModelsDependentOnSource,
     DBTDuplicateSources,
     DBTModelFanout,
     DBTRootModel,
     DBTSourceFanout,
     DBTStagingModelsDependentOnDownstreamModels,
     DBTStagingModelsDependentOnStagingModels,
     DBTUnusedSources,
     DBTModelsMultipleSourcesJoined,
     DBTHardCodedReferences,
     DBTRejoiningOfUpstreamConcepts,
     DBTExposureDependentOnPrivateModels,
     DBTUndocumentedPublicModels,
     DBTPublicModelWithoutContracts,
     DBTChainViewLinking,
     DBTExposureParentMaterialization,
     DBTMissingDocumentation,
     DBTDocumentationStaleColumns,
     MissingPrimaryKeyTests,
     DBTTestCoverage,
     DBTModelDirectoryStructure,
     DBTModelNamingConvention,
     DBTSourceDirectoryStructure,
     DBTTestDirectoryStructure,
     CheckColumnDescAreSame,
     CheckColumnNameContract,
     CheckMacroArgsHaveDesc,
     CheckMacroHasDesc,
     CheckModelHasAllColumns,
     # CheckModelHasLabelsKeys,
     CheckModelHasMetaKeys,
     CheckModelHasPropertiesFile,
     CheckModelHasTestsByName,
     CheckModelHasTestsByType,
     CheckModelHasTestsByGroup,
     CheckModelMaterializationByChilds,
     CheckModelNameContract,
     CheckModelParentsAndChilds,
     CheckModelParentsDatabase,
     CheckModelParentsSchema,
     CheckModelTags,
     CheckSourceChilds,
     CheckSourceColumnsHaveDescriptions,
     CheckSourceHasAllColumns,
     CheckSourceHasFreshness,
     # CheckSourceHasLabelsKeys,
     CheckSourceHasLoader,
     CheckSourceHasMetaKeys,
     CheckSourceHasTestsByName,
     CheckSourceHasTestsByType,
     CheckSourceHasTestsByGroup,
     CheckSourceHasTests,
     CheckSourceTableHasDescription,
     CheckSourceTags,
+    SqlCheck,  # sqlglot-based SQL optimization check, registered alongside the existing insights
 ]
diff --git a/src/datapilot/core/platforms/dbt/insights/base.py b/src/datapilot/core/platforms/dbt/insights/base.py
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import ClassVar
+from typing import ClassVar, Optional
 from typing import Dict
 from typing import List
 from typing import Union
@@ -33,6 +33,7 @@ def __init__(
         macros: Dict[str, AltimateManifestMacroNode],
         children_map: Dict[str, List[str]],
         project_name: str,
+        adapter_type: Optional[str],
         selected_models: Union[List[str], None] = None,
         excluded_models: Union[List[str], None] = None,
         *args,
@@ -47,6 +48,7 @@ def __init__(
         self.seeds = seeds
         self.children_map = children_map
         self.project_name = project_name
+        self.adapter_type = adapter_type
         self.selected_models = selected_models
         self.excluded_models = excluded_models
         super().__init__(*args, **kwargs)
diff --git a/src/datapilot/core/platforms/dbt/insights/sql/__init__.py b/src/datapilot/core/platforms/dbt/insights/sql/__init__.py
diff --git a/src/datapilot/core/platforms/dbt/insights/sql/base.py b/src/datapilot/core/platforms/dbt/insights/sql/base.py
@@ -0,0 +1,23 @@
+from abc import abstractmethod
+from typing import Tuple
+
+from datapilot.core.platforms.dbt.insights.base import DBTInsight
+
+
+class SqlInsight(DBTInsight):
+    TYPE = "governance"
+
+    @abstractmethod
+    def generate(self, *args, **kwargs) -> dict:
+        ...
+
+    @classmethod
+    def has_all_required_data(cls, has_manifest: bool, **kwargs) -> Tuple[bool, str]:
+        """
+        Report whether the insight has the data it needs to run.
+        :param has_manifest: Whether a manifest is available.
+        :return: ``(True, "")`` when it can run, otherwise ``(False, reason)``.
+        """
+        if has_manifest:
+            return True, ""
+        return False, "manifest is required for insight to run."
diff --git a/src/datapilot/core/platforms/dbt/insights/sql/sql_check.py b/src/datapilot/core/platforms/dbt/insights/sql/sql_check.py
@@ -0,0 +1,120 @@
+import inspect
+from typing import List, Tuple
+
+from sqlglot import parse_one
+from sqlglot.optimizer.eliminate_ctes import eliminate_ctes
+from sqlglot.optimizer.eliminate_joins import eliminate_joins
+from sqlglot.optimizer.eliminate_subqueries import eliminate_subqueries
+from sqlglot.optimizer.normalize import normalize
+from sqlglot.optimizer.pushdown_projections import pushdown_projections
+from sqlglot.optimizer.qualify import qualify
+from sqlglot.optimizer.unnest_subqueries import unnest_subqueries
+
+from datapilot.core.insights.sql.base.insight import SqlInsight
+from datapilot.core.insights.utils import get_severity
+from datapilot.core.platforms.dbt.insights.schema import DBTInsightResult, DBTModelInsightResponse
+
+RULES = (  # sqlglot optimizer rules; SqlCheck.generate iterates them in this order
+    pushdown_projections,
+    normalize,
+    unnest_subqueries,
+    eliminate_subqueries,
+    eliminate_joins,
+    eliminate_ctes,
+)
+
+class SqlCheck(SqlInsight):
+    """
+    Flags dbt models whose compiled SQL is changed by a sqlglot optimizer rule.
+    The compiled query is qualified, then each rule in ``RULES`` is applied in order.
+    """
+
+    NAME = "sql optimization issues"
+    ALIAS = "check_sql_optimization"
+    DESCRIPTION = "Checks if the model has SQL optimization issues. "
+    REASON_TO_FLAG = "The query can be optimized."
+    FAILURE_MESSAGE = "The query for model `{model_unique_id}` has optimization opportunities:\n{rule_name}. "
+    RECOMMENDATION = "Please adapt the query of the model `{model_unique_id}` as in following example:\n{optimized_sql}"
+
+    def _build_failure_result(self, model_unique_id: str, rule_name: str, optimized_sql: str) -> DBTInsightResult:
+        """
+        Build the failure result for a model that an optimizer rule rewrote.
+
+        :param model_unique_id: Unique id of the flagged model.
+        :param rule_name: Name of the sqlglot rule that changed the query.
+        :param optimized_sql: SQL produced by the rule, shown as the suggested rewrite.
+        :return: An instance of DBTInsightResult containing failure details.
+        """
+        return DBTInsightResult(
+            type=self.TYPE,
+            name=self.NAME,
+            message=self.FAILURE_MESSAGE.format(model_unique_id=model_unique_id, rule_name=rule_name),
+            recommendation=self.RECOMMENDATION.format(model_unique_id=model_unique_id, optimized_sql=optimized_sql),
+            reason_to_flag=self.REASON_TO_FLAG,
+            metadata={"model_unique_id": model_unique_id, "rule_name": rule_name},
+        )
+
+    def generate(self, *args, **kwargs) -> List[DBTModelInsightResponse]:
+        """
+        Run every rule in ``RULES`` over each model's compiled SQL and report rewrites.
+
+        Extra keyword arguments override the optimizer options; a node whose SQL fails is logged and skipped.
+
+        :return: One DBTModelInsightResponse per (model, rule) pair whose rule changed the SQL.
+        """
+        self.logger.debug("Generating sql insights for DBT models")
+        insights = []
+
+        possible_kwargs = {
+            "db": None,
+            "catalog": None,
+            "dialect": self.adapter_type,
+            "isolate_tables": True,  # needed for other optimizations to perform well
+            "quote_identifiers": False,
+            **kwargs,
+        }
+        # A rule only receives the options its signature names; resolved once, not once per node.
+        rule_kwargs = {
+            rule: {
+                param: possible_kwargs[param]
+                for param in inspect.getfullargspec(rule).args
+                if param in possible_kwargs
+            }
+            for rule in RULES
+        }
+        for node_id, node in self.nodes.items():
+            try:
+                compiled_query = node.compiled_code
+                if not compiled_query:
+                    continue
+                parsed_query = parse_one(compiled_query, dialect=self.adapter_type)
+                changed = qualify(parsed_query, **possible_kwargs)
+                # Render in the model's own dialect so the suggested SQL matches the source dialect.
+                previous_sql = changed.sql(dialect=self.adapter_type)
+                for rule in RULES:
+                    changed = rule(changed, **rule_kwargs[rule])
+                    changed_sql = changed.sql(dialect=self.adapter_type)
+                    if changed_sql != previous_sql:
+                        insights.append(
+                            DBTModelInsightResponse(
+                                unique_id=node_id,
+                                package_name=node.package_name,
+                                path=node.original_file_path,
+                                original_file_path=node.original_file_path,
+                                insight=self._build_failure_result(node_id, rule.__name__, changed_sql),
+                                severity=get_severity(self.config, self.ALIAS, self.DEFAULT_SEVERITY),
+                            )
+                        )
+                    previous_sql = changed_sql
+            except Exception as e:
+                # Best effort: one model that fails to parse or optimize must not abort the run.
+                self.logger.error(f"SQL check failed for `{node_id}`: {e}")
+        return insights
+
+    @staticmethod
+    def parse_query(query: str, dialect: str = "snowflake"):
+        """Parse ``query`` with sqlglot; return the expression, or None if parsing fails."""
+        try:
+            return parse_one(query, read=dialect)
+        except Exception:
+            return None
diff --git a/src/datapilot/core/platforms/dbt/wrappers/manifest/v10/wrapper.py b/src/datapilot/core/platforms/dbt/wrappers/manifest/v10/wrapper.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, Optional
 from typing import Set
 
 from dbt_artifacts_parser.parsers.manifest.manifest_v10 import GenericTestNode
@@ -67,6 +67,7 @@ def _get_node(self, node: ManifestNode) -> AltimateManifestNode:
             depends_on_macros = node.depends_on.macros if node.depends_on else None
             compiled_path = node.compiled_path
             compiled = node.compiled
+            compiled_code = node.compiled_code
             raw_code = node.raw_code
             language = node.language
             contract = AltimateDBTContract(**node.contract.__dict__) if node.contract else None
@@ -381,6 +382,9 @@ def get_seeds(self) -> Dict[str, AltimateSeedNode]:
                 seeds[seed.unique_id] = self._get_seed(seed)
         return seeds
 
+    def get_adapter_type(self) -> Optional[str]:
+        return self.manifest.metadata.adapter_type  # read straight from manifest metadata; no normalisation
+
     def parent_to_child_map(self, nodes: Dict[str, AltimateManifestNode]) -> Dict[str, Set[str]]:
         """
         Current manifest contains information about parents
diff --git a/src/datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py b/src/datapilot/core/platforms/dbt/wrappers/manifest/v11/wrapper.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, Optional
 from typing import Set
 
 from dbt_artifacts_parser.parsers.manifest.manifest_v11 import GenericTestNode
@@ -67,6 +67,7 @@ def _get_node(self, node: ManifestNode) -> AltimateManifestNode:
             depends_on_macros = node.depends_on.macros if node.depends_on else None
             compiled_path = node.compiled_path
             compiled = node.compiled
+            compiled_code = node.compiled_code
             raw_code = node.raw_code
             language = node.language
             contract = AltimateDBTContract(**node.contract.__dict__) if node.contract else None
@@ -381,6 +382,9 @@ def get_seeds(self) -> Dict[str, AltimateSeedNode]:
                 seeds[seed.unique_id] = self._get_seed(seed)
         return seeds
 
+    def get_adapter_type(self) -> Optional[str]:
+        return self.manifest.metadata.adapter_type  # read straight from manifest metadata; no normalisation
+
     def parent_to_child_map(self, nodes: Dict[str, AltimateManifestNode]) -> Dict[str, Set[str]]:
         """
         Current manifest contains information about parents
diff --git a/src/datapilot/core/platforms/dbt/wrappers/manifest/v12/wrapper.py b/src/datapilot/core/platforms/dbt/wrappers/manifest/v12/wrapper.py
diff --git a/src/datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py b/src/datapilot/core/platforms/dbt/wrappers/manifest/wrapper.py