diff --git a/deploy/testgen-base.dockerfile b/deploy/testgen-base.dockerfile index f04aa3ba..de45fcf7 100644 --- a/deploy/testgen-base.dockerfile +++ b/deploy/testgen-base.dockerfile @@ -1,4 +1,4 @@ -FROM python:3.12.7-alpine3.20 +FROM python:3.12-alpine3.22 ENV LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 @@ -14,7 +14,7 @@ RUN apk update && apk upgrade && apk add --no-cache \ cmake \ musl-dev \ gfortran \ - linux-headers=6.6-r0 \ + linux-headers=6.14.2-r0 \ # Tools needed for installing the MSSQL ODBC drivers \ curl \ gpg \ @@ -25,7 +25,7 @@ RUN apk update && apk upgrade && apk add --no-cache \ unixodbc=2.3.12-r0 \ unixodbc-dev=2.3.12-r0 \ # Pinned versions for security - xz=5.6.2-r1 + xz=5.8.1-r0 RUN apk add --no-cache \ --repository https://dl-cdn.alpinelinux.org/alpine/v3.21/community \ @@ -48,7 +48,6 @@ RUN apk del \ cmake \ musl-dev \ gfortran \ - curl \ gpg \ linux-headers \ openblas-dev \ diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index 58e15db3..4ff2ff94 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -1,4 +1,4 @@ -ARG TESTGEN_BASE_LABEL=v7 +ARG TESTGEN_BASE_LABEL=v9 FROM datakitchen/dataops-testgen-base:${TESTGEN_BASE_LABEL} AS release-image diff --git a/pyproject.toml b/pyproject.toml index 37d1e0f9..b7714a67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "dataops-testgen" -version = "4.32.5" +version = "4.38.3" description = "DataKitchen's Data Quality DataOps TestGen" authors = [ { "name" = "DataKitchen, Inc.", "email" = "info@datakitchen.io" }, @@ -60,6 +60,8 @@ dependencies = [ "streamlit-pydantic==0.6.0", "cron-converter==1.2.1", "cron-descriptor==2.0.5", + "pybars3==0.9.7", + "azure-identity==1.25.1", # Pinned to match the manually compiled libs or for security "pyarrow==18.1.0", diff --git a/testgen/__main__.py b/testgen/__main__.py index c0d7a7f9..e4871f0c 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -4,13 +4,12 @@ import subprocess import sys from dataclasses import dataclass, field +from datetime import UTC, datetime, timedelta import click from click.core import Context -from progress.spinner import MoonSpinner from testgen import settings -from testgen.commands.run_execute_tests import run_execution_steps from testgen.commands.run_generate_tests import run_test_gen_queries from testgen.commands.run_get_entities import ( run_get_results, @@ -29,8 +28,9 @@ ) from testgen.commands.run_launch_db_config import run_launch_db_config from testgen.commands.run_observability_exporter import run_observability_exporter -from testgen.commands.run_profiling_bridge import run_profiling_queries +from testgen.commands.run_profiling import run_profiling from testgen.commands.run_quick_start import run_quick_start, run_quick_start_increment +from testgen.commands.run_test_execution import run_test_execution from testgen.commands.run_test_metadata_exporter import run_test_metadata_exporter from testgen.commands.run_upgrade_db_config import get_schema_revision, is_db_revision_up_to_date, run_upgrade_db_config from testgen.common import ( @@ -45,6 +45,7 @@ from testgen.common.models import with_database_session from testgen.common.models.profiling_run import ProfilingRun from testgen.common.models.test_run import TestRun +from testgen.common.models.test_suite import TestSuite from testgen.scheduler import register_scheduler_job, run_scheduler from testgen.utils import plugins @@ -114,20 +115,16 @@ def cli(ctx: Context, verbose: bool): @register_scheduler_job 
@cli.command("run-profile", help="Generates a new profile of the table group.") -@pass_configuration @click.option( "-tg", "--table-group-id", required=True, type=click.STRING, - help="The identifier for the table group used during a profile run. Use a table_group_id shown in list-table-groups.", + help="ID of the table group to profile. Use a table_group_id shown in list-table-groups.", ) -def run_profile(configuration: Configuration, table_group_id: str): +def run_profile(table_group_id: str): click.echo(f"run-profile with table_group_id: {table_group_id}") - spinner = None - if not configuration.verbose: - spinner = MoonSpinner("Processing ... ") - message = run_profiling_queries(table_group_id, spinner=spinner) + message = run_profiling(table_group_id) click.echo("\n" + message) @@ -163,10 +160,17 @@ def run_test_generation(configuration: Configuration, table_group_id: str, test_ @register_scheduler_job @cli.command("run-tests", help="Performs tests defined for a test suite.") +@click.option( + "-t", + "--test-suite-id", + required=False, + type=click.STRING, + help="ID of the test suite to run. Use a test_suite_id shown in list-test-suites.", +) @click.option( "-pk", "--project-key", - help="The identifier for a TestGen project. Use a project_key shown in list-projects.", + help="DEPRECATED. Use --test-suite-id instead.", required=False, type=click.STRING, default=settings.PROJECT_KEY, @@ -174,17 +178,22 @@ def run_test_generation(configuration: Configuration, table_group_id: str, test_ @click.option( "-ts", "--test-suite-key", - help="The identifier for a test suite. Use a test_suite_key shown in list-test-suites.", + help="DEPRECATED. Use --test-suite-id instead.", required=False, default=settings.DEFAULT_TEST_SUITE_KEY, ) -@pass_configuration -def run_tests(configuration: Configuration, project_key: str, test_suite_key: str): - click.echo(f"run-tests for suite: {test_suite_key}") - spinner = None - if not configuration.verbose: - spinner = MoonSpinner("Processing ... ") - message = run_execution_steps(project_key, test_suite_key, spinner=spinner) +@with_database_session +def run_tests(test_suite_id: str | None = None, project_key: str | None = None, test_suite_key: str | None = None): + click.echo(f"run-tests for suite: {test_suite_id or test_suite_key}") + # For backward compatibility + if not test_suite_id: + test_suites = TestSuite.select_minimal_where( + TestSuite.project_code == project_key, + TestSuite.test_suite == test_suite_key, + ) + if test_suites: + test_suite_id = test_suites[0].id + message = run_test_execution(test_suite_id) click.echo("\n" + message) @@ -370,27 +379,27 @@ def quick_start( click.echo("loading initial data") run_quick_start_increment(0) - minutes_offset = -30*24*60 # 1 month ago - table_group_id="0ea85e17-acbe-47fe-8394-9970725ad37d" + now_date = datetime.now(UTC) + time_delta = timedelta(days=-30) # 1 month ago + table_group_id = "0ea85e17-acbe-47fe-8394-9970725ad37d" + test_suite_id = "9df7489d-92b3-49f9-95ca-512160d7896f" click.echo(f"run-profile with table_group_id: {table_group_id}") - spinner = None - if not configuration.verbose: - spinner = MoonSpinner("Processing ... 
") - message = run_profiling_queries(table_group_id, spinner=spinner, minutes_offset=minutes_offset) + message = run_profiling(table_group_id, run_date=now_date + time_delta) click.echo("\n" + message) LOG.info(f"run-test-generation with table_group_id: {table_group_id} test_suite: {settings.DEFAULT_TEST_SUITE_KEY}") message = run_test_gen_queries(table_group_id, settings.DEFAULT_TEST_SUITE_KEY) click.echo("\n" + message) - run_execution_steps(settings.PROJECT_KEY, settings.DEFAULT_TEST_SUITE_KEY, minutes_offset=minutes_offset) + run_test_execution(test_suite_id, run_date=now_date + time_delta) - for iteration in range(1, 4): - click.echo(f"Running iteration: {iteration} / 3") - minutes_offset = -10*24*60 * (3-iteration) + total_iterations = 3 + for iteration in range(1, total_iterations + 1): + click.echo(f"Running iteration: {iteration} / {total_iterations}") + run_date = now_date + timedelta(days=-10 * (total_iterations - iteration)) # 10 day increments run_quick_start_increment(iteration) - run_execution_steps(settings.PROJECT_KEY, settings.DEFAULT_TEST_SUITE_KEY, minutes_offset=minutes_offset) + run_test_execution(test_suite_id, run_date=run_date) click.echo("Quick start has successfully finished.") diff --git a/testgen/commands/queries/contingency_query.py b/testgen/commands/queries/contingency_query.py new file mode 100644 index 00000000..0a8437c7 --- /dev/null +++ b/testgen/commands/queries/contingency_query.py @@ -0,0 +1,51 @@ +# UNUSED CODE - TO BE REVIVED LATER + +import dataclasses +from uuid import UUID + +from testgen.common import read_template_sql_file +from testgen.common.database.database_service import quote_csv_items, replace_params + + +@dataclasses.dataclass +class ContingencyTable: + schema_name: str + table_name: str + contingency_columns: str + + +class ContingencySQL: + + contingency_max_values = 6 + + def _get_query( + self, + template_file_name: str, + sub_directory: str | None = "contingency", + params: dict | None = None, + ) -> tuple[str | None, dict]: + query = read_template_sql_file(template_file_name, sub_directory) + query = replace_params(query, params or {}) + + return query, params + + def get_contingency_columns(self, profiling_run_id: UUID) -> tuple[str, dict]: + # Runs on App database + return self._get_query( + "contingency_columns.sql", + params={ + "PROFILE_RUN_ID": profiling_run_id, + "CONTINGENCY_MAX_VALUES": self.contingency_max_values, + }, + ) + + def get_contingency_counts(self, contingency_table: ContingencyTable) -> tuple[str, dict]: + # Runs on Target database + return self._get_query( + "contingency_counts.sql", + params={ + "DATA_SCHEMA": contingency_table.schema_name, + "DATA_TABLE": contingency_table.table_name, + "CONTINGENCY_COLUMNS": quote_csv_items(contingency_table.contingency_columns), + }, + ) diff --git a/testgen/commands/queries/execute_cat_tests_query.py b/testgen/commands/queries/execute_cat_tests_query.py deleted file mode 100644 index 5f70a59d..00000000 --- a/testgen/commands/queries/execute_cat_tests_query.py +++ /dev/null @@ -1,123 +0,0 @@ -from typing import ClassVar, TypedDict - -from testgen.commands.queries.rollup_scores_query import CRollupScoresSQL -from testgen.common import date_service, read_template_sql_file -from testgen.common.database.database_service import get_flavor_service, replace_params -from testgen.common.read_file import replace_templated_functions - - -class CATTestParams(TypedDict): - schema_name: str - table_name: str - cat_sequence: int - test_measures: str - test_conditions: str - - -class 
CCATExecutionSQL: - project_code = "" - flavor = "" - test_suite = "" - run_date = "" - test_run_id = "" - table_groups_id = "" - max_query_chars = "" - exception_message = "" - target_schema = "" - target_table = "" - cat_test_params: ClassVar[CATTestParams] = {} - - _rollup_scores_sql: CRollupScoresSQL = None - - def __init__(self, strProjectCode, strTestSuiteId, strTestSuite, strSQLFlavor, max_query_chars, minutes_offset=0): - # Defaults - self.test_suite_id = strTestSuiteId - self.test_suite = strTestSuite - self.project_code = strProjectCode - self.flavor_service = get_flavor_service(strSQLFlavor) - self.flavor = strSQLFlavor - self.max_query_chars = max_query_chars - self.today = date_service.get_now_as_string_with_offset(minutes_offset) - self.minutes_offset = minutes_offset - - def _get_rollup_scores_sql(self) -> CRollupScoresSQL: - if not self._rollup_scores_sql: - self._rollup_scores_sql = CRollupScoresSQL(self.test_run_id, self.table_groups_id) - - return self._rollup_scores_sql - - def _get_query(self, template_file_name: str, sub_directory: str | None = "exec_cat_tests", no_bind: bool = False) -> tuple[str, dict | None]: - query = read_template_sql_file(template_file_name, sub_directory) - params = { - "MAX_QUERY_CHARS": self.max_query_chars, - "TEST_RUN_ID": self.test_run_id, - "PROJECT_CODE": self.project_code, - "TEST_SUITE": self.test_suite, - "TEST_SUITE_ID": self.test_suite_id, - "TABLE_GROUPS_ID": self.table_groups_id, - "SQL_FLAVOR": self.flavor, - "QUOTE": self.flavor_service.quote_character, - "VARCHAR_TYPE": self.flavor_service.varchar_type, - "CONCAT_OPERATOR": self.flavor_service.concat_operator, - "SCHEMA_NAME": self.target_schema, - "TABLE_NAME": self.target_table, - "NOW_DATE": "GETDATE()", - "START_TIME": self.today, - "NOW_TIMESTAMP": date_service.get_now_as_string_with_offset(self.minutes_offset), - "EXCEPTION_MESSAGE": self.exception_message.strip(), - **{key.upper(): value for key, value in self.cat_test_params.items()}, - # This has to be replaced at the end - "RUN_DATE": self.run_date, - } - query = replace_params(query, params) - query = replace_templated_functions(query, self.flavor) - - if no_bind: - # Adding escape character where ':' is referenced - query = query.replace(":", "\\:") - - return query, None if no_bind else params - - def GetDistinctTablesSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_cat_get_distinct_tables.sql") - - def GetAggregateTableTestSQL(self) -> tuple[str, None]: - # Runs on App database - return self._get_query("ex_cat_build_agg_table_tests.sql", no_bind=True) - - def GetAggregateTestParmsSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_cat_retrieve_agg_test_parms.sql") - - def PrepCATQuerySQL(self) -> tuple[str, None]: - # Runs on Target database - return self._get_query("ex_cat_test_query.sql", no_bind=True) - - def GetCATResultsParseSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_cat_results_parse.sql") - - def FinalizeTestResultsSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_finalize_test_run_results.sql", "execution") - - def PushTestRunStatusUpdateSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_update_test_record_in_testrun_table.sql", "execution") - - def FinalizeTestSuiteUpdateSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_update_test_suite.sql", "execution") - - def 
CalcPrevalenceTestResultsSQL(self) -> tuple[str, None]: - # Runs on App database - return self._get_query("ex_calc_prevalence_test_results.sql", "execution", no_bind=True) - - def TestScoringRollupRunSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_rollup_scores_sql().GetRollupScoresTestRunQuery() - - def TestScoringRollupTableGroupSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_rollup_scores_sql().GetRollupScoresTestTableGroupQuery() diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index 65679ad3..6caa1567 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -1,19 +1,21 @@ -from typing import ClassVar, TypedDict +import dataclasses +from collections.abc import Iterable +from datetime import datetime +from typing import TypedDict +from uuid import UUID -from testgen.common import date_service, read_template_sql_file -from testgen.common.clean_sql import CleanSQL, ConcatColumnList, quote_identifiers +from testgen.common import read_template_sql_file +from testgen.common.clean_sql import concat_columns from testgen.common.database.database_service import get_flavor_service, replace_params +from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup +from testgen.common.models.test_definition import TestRunType, TestScope +from testgen.common.models.test_run import TestRun +from testgen.common.read_file import replace_templated_functions -class TestParams(TypedDict): - test_type: str - test_definition_id: str - test_description: str - test_action: str - schema_name: str - table_name: str - column_name: str - skip_errors: str +@dataclasses.dataclass +class InputParameters: baseline_ct: str baseline_unique_ct: str baseline_value: str @@ -35,139 +37,302 @@ class TestParams(TypedDict): match_subset_condition: str match_groupby_names: str match_having_condition: str + +@dataclasses.dataclass +class TestExecutionDef(InputParameters): + id: UUID + test_type: str + schema_name: str + table_name: str + column_name: str + skip_errors: int custom_query: str + run_type: TestRunType + test_scope: TestScope template_name: str + measure: str + test_operator: str + test_condition: str + # Runtime attributes + column_type: str = None + measure_expression: str = None + condition_expression: str = None + errors: list[str] = dataclasses.field(default_factory=list) +class AggregateResult(TypedDict): + query_index: int + result_measures: str + result_codes: str -class CTestExecutionSQL: - flavor = "" - run_date = "" - project_code = "" - test_suite_id = "" - test_suite = "" - test_run_id = "" - exception_message = "" - process_id = "" - test_params: ClassVar[TestParams] = {} - - _use_clean = False - - def __init__(self, strProjectCode, strFlavor, strTestSuiteId, strTestSuite, minutes_offset=0): - self.project_code = strProjectCode - self.flavor = strFlavor - self.flavor_service = get_flavor_service(strFlavor) - self.test_suite_id = strTestSuiteId - self.test_suite = strTestSuite - self.today = date_service.get_now_as_string_with_offset(minutes_offset) - self.minutes_offset = minutes_offset - - def _get_input_parameters(self): - param_keys = [ - "column_name", - "skip_errors", - "baseline_ct", - "baseline_unique_ct", - "baseline_value", - "baseline_value_ct", - "baseline_sum", - "baseline_avg", - "baseline_sd", - "lower_tolerance", - "upper_tolerance", - "subset_condition", - 
"groupby_names", - "having_condition", - "window_date_column", - "window_days", - "match_column_names", - "match_subset_condition", - "match_schema_name", - "match_table_name", - "match_groupby_names", - "match_having_condition", - ] - input_parameters = "; ".join( - f"{key}={self.test_params[key]}" - for key in param_keys - if key.lower() in self.test_params and self.test_params[key] not in [None, ""] - ) - return input_parameters.replace("'", "`") - def _get_query( - self, template_file_name: str, sub_directory: str | None = "execution", no_bind: bool = False - ) -> tuple[str, dict | None]: - query = read_template_sql_file(template_file_name, sub_directory) +class TestExecutionSQL: + + null_value = "" + test_results_table = "test_results" + result_columns = ( + "test_run_id", + "test_suite_id", + "test_time", + "test_definition_id", + "test_type", + "schema_name", + "table_name", + "column_names", + "skip_errors", + "input_parameters", + "result_code", + "result_status", + "result_message", + "result_measure", + ) + + def __init__(self, connection: Connection, table_group: TableGroup, test_run: TestRun): + self.connection = connection + self.table_group = table_group + self.test_run = test_run + self.run_date = test_run.test_starttime.strftime("%Y-%m-%d %H:%M:%S") + self.flavor = connection.sql_flavor + self.flavor_service = get_flavor_service(self.flavor) + + def _get_input_parameters(self, test_def: TestExecutionDef) -> str: + return "; ".join( + f"{field.name}={getattr(test_def, field.name)}" + for field in dataclasses.fields(InputParameters) + if getattr(test_def, field.name, None) not in [None, ""] + ).replace("'", "`") + + def _get_params(self, test_def: TestExecutionDef | None = None) -> dict: + quote = self.flavor_service.quote_character params = { - "PROJECT_CODE": self.project_code, - "TEST_SUITE_ID": self.test_suite_id, - "TEST_SUITE": self.test_suite, - "SQL_FLAVOR": self.flavor, - "QUOTE": self.flavor_service.quote_character, - "TEST_RUN_ID": self.test_run_id, - "INPUT_PARAMETERS": self._get_input_parameters(), + "TEST_SUITE_ID": self.test_run.test_suite_id, + "TEST_RUN_ID": self.test_run.id, "RUN_DATE": self.run_date, - "EXCEPTION_MESSAGE": self.exception_message, - "START_TIME": self.today, - "PROCESS_ID": self.process_id, + "SQL_FLAVOR": self.flavor, "VARCHAR_TYPE": self.flavor_service.varchar_type, - "NOW_TIMESTAMP": date_service.get_now_as_string_with_offset(self.minutes_offset), - **{key.upper(): value or "" for key, value in self.test_params.items()}, + "QUOTE": quote, } - if self.test_params: - column_name = self.test_params["column_name"] - params["COLUMN_NAME"] = quote_identifiers(column_name, self.flavor) if column_name else "" - # Shows contents without double-quotes for display and aggregate expressions - params["COLUMN_NAME_NO_QUOTES"] = column_name or "" - # Concatenates column list into single expression for relative entropy - params["CONCAT_COLUMNS"] = ConcatColumnList(column_name, "") if column_name else "" - - match_groupby_names = self.test_params["match_groupby_names"] - # Concatenates column list into single expression for relative entropy - params["CONCAT_MATCH_GROUPBY"] = ( - ConcatColumnList(match_groupby_names, "") if match_groupby_names else "" - ) + if test_def: + params.update({ + "TEST_TYPE": test_def.test_type, + "TEST_DEFINITION_ID": test_def.id, + "SCHEMA_NAME": test_def.schema_name, + "TABLE_NAME": test_def.table_name, + "COLUMN_NAME": f"{quote}{test_def.column_name or ''}{quote}", + "COLUMN_NAME_NO_QUOTES": test_def.column_name, + 
"CONCAT_COLUMNS": concat_columns(test_def.column_name, self.null_value) if test_def.column_name else "", + "SKIP_ERRORS": test_def.skip_errors or 0, + "BASELINE_CT": test_def.baseline_ct, + "BASELINE_UNIQUE_CT": test_def.baseline_unique_ct, + "BASELINE_VALUE": test_def.baseline_value, + "BASELINE_VALUE_CT": test_def.baseline_value_ct, + "THRESHOLD_VALUE": test_def.threshold_value, + "BASELINE_SUM": test_def.baseline_sum, + "BASELINE_AVG": test_def.baseline_avg, + "BASELINE_SD": test_def.baseline_sd, + "LOWER_TOLERANCE": test_def.lower_tolerance, + "UPPER_TOLERANCE": test_def.upper_tolerance, + "SUBSET_CONDITION": test_def.subset_condition or "1=1", + "GROUPBY_NAMES": test_def.groupby_names, + "HAVING_CONDITION": f"HAVING {test_def.having_condition}" if test_def.having_condition else "", + "WINDOW_DATE_COLUMN": test_def.window_date_column, + "WINDOW_DAYS": test_def.window_days or 0, + "MATCH_SCHEMA_NAME": test_def.match_schema_name, + "MATCH_TABLE_NAME": test_def.match_table_name, + "MATCH_COLUMN_NAMES": test_def.match_column_names, + "MATCH_SUBSET_CONDITION": test_def.match_subset_condition or "1=1", + "MATCH_GROUPBY_NAMES": test_def.match_groupby_names, + "CONCAT_MATCH_GROUPBY": concat_columns(test_def.match_groupby_names, self.null_value) if test_def.match_groupby_names else "", + "MATCH_HAVING_CONDITION": f"HAVING {test_def.match_having_condition}" if test_def.match_having_condition else "", + "CUSTOM_QUERY": test_def.custom_query, + "COLUMN_TYPE": test_def.column_type, + "INPUT_PARAMETERS": self._get_input_parameters(test_def), + }) + return params - subset_condition = self.test_params["subset_condition"] - params["SUBSET_DISPLAY"] = subset_condition.replace( - "'", self.flavor_service.escaped_single_quote - ) if subset_condition else "" + def _get_query( + self, + template_file_name: str, + sub_directory: str | None = "execution", + no_bind: bool = False, + extra_params: dict | None = None, + test_def: TestExecutionDef | None = None, + ) -> tuple[str, dict | None]: + query = read_template_sql_file(template_file_name, sub_directory) + params = self._get_params(test_def) + if extra_params: + params.update(extra_params) query = replace_params(query, params) if no_bind: - # Adding escape character where ':' is referenced query = query.replace(":", "\\:") return query, None if no_bind else params - - def GetTestsNonCAT(self) -> tuple[str, dict]: + + def get_active_test_definitions(self) -> tuple[dict]: # Runs on App database - query, params = self._get_query("ex_get_tests_non_cat.sql") - if self._use_clean: - query = CleanSQL(query) - return query, params - - def GetHistoricThresholdUpdate(self) -> tuple[str, dict]: - query, params = self._get_query("ex_update_history_threshold_last_n.sql") - if self._use_clean: - query = CleanSQL(query) - return query, params - - def PushTestRunStatusUpdateSQL(self) -> tuple[str, dict]: + return self._get_query("get_active_test_definitions.sql") + + def get_target_identifiers(self, schemas: Iterable[str]) -> tuple[str, dict]: + # Runs on Target database + filename = "get_target_identifiers.sql" + params = { + "DATA_SCHEMA": self.table_group.table_group_schema, + "TEST_SCHEMAS": ", ".join([f"'{item}'" for item in schemas]), + } + try: + return self._get_query(filename, f"flavors/{self.connection.sql_flavor}/validate_tests", extra_params=params) + except ModuleNotFoundError: + return self._get_query(filename, "flavors/generic/validate_tests", extra_params=params) + + def get_test_errors(self, test_defs: list[TestExecutionDef]) -> list[list[UUID | str | 
datetime]]: + return [ + [ + self.test_run.id, + self.test_run.test_suite_id, + self.test_run.test_starttime, + td.id, + td.test_type, + td.schema_name, + td.table_name, + td.column_name, + td.skip_errors or 0, + self._get_input_parameters(td), + None, # No result_code on errors + "Error", + ". ".join(td.errors), + None, # No result_measure on errors + ] for td in test_defs if td.errors + ] + + def disable_invalid_test_definitions(self) -> tuple[str, dict]: # Runs on App database - return self._get_query("ex_update_test_record_in_testrun_table.sql") - - def GetTestQuery(self) -> tuple[str, None]: + return self._get_query("disable_invalid_test_definitions.sql") + + def update_historic_thresholds(self) -> tuple[str, dict]: + # Runs on App database + return self._get_query("update_historic_thresholds.sql") + + def run_query_test(self, test_def: TestExecutionDef) -> tuple[str, dict]: # Runs on Target database - if template_name := self.test_params["template_name"]: - template_flavor = "generic" if template_name.endswith("_generic.sql") else self.flavor - query, params = self._get_query(template_name, f"flavors/{template_flavor}/exec_query_tests", no_bind=True) - # Final replace to cover parm within CUSTOM_QUERY parm - query = replace_params(query, {"DATA_SCHEMA": self.test_params["schema_name"]}) - - if self._use_clean: - query = CleanSQL(query) - return query, params + folder = "generic" if test_def.template_name.endswith("_generic.sql") else self.flavor + return self._get_query( + test_def.template_name, + f"flavors/{folder}/exec_query_tests", + no_bind=True, + # Final replace in CUSTOM_QUERY + extra_params={"DATA_SCHEMA": test_def.schema_name}, + test_def=test_def, + ) + + def aggregate_cat_tests( + self, + test_defs: list[TestExecutionDef], + single: bool = False, + ) -> tuple[list[tuple[str, None]], list[list[TestExecutionDef]]]: + varchar_type = self.flavor_service.varchar_type + concat_operator = self.flavor_service.concat_operator + quote = self.flavor_service.quote_character + + for td in test_defs: + # Don't recalculate expressions if they were already computed + if not td.measure_expression or not td.condition_expression: + params = self._get_params(td) + + measure = replace_params(td.measure, params) + measure = replace_templated_functions(measure, self.flavor) + td.measure_expression = f"COALESCE(CAST({measure} AS {varchar_type}) {concat_operator} '|', '{self.null_value}|')" + + condition = replace_params(f"{td.measure}{td.test_operator}{td.test_condition}", params) + condition = replace_templated_functions(condition, self.flavor) + td.condition_expression = f"CASE WHEN {condition} THEN '0,' ELSE '1,' END" + + aggregate_queries: list[tuple[str, None]] = [] + aggregate_test_defs: list[list[TestExecutionDef]] = [] + + def add_query(test_defs: list[TestExecutionDef]) -> None: + if not test_defs: + return + + query = ( + f"SELECT {len(aggregate_queries)} AS query_index, " + f"{concat_operator.join([td.measure_expression for td in test_defs])} AS result_measures, " + f"{concat_operator.join([td.condition_expression for td in test_defs])} AS result_codes " + f"FROM {quote}{test_defs[0].schema_name}{quote}.{quote}{test_defs[0].table_name}{quote}" + ) + query = query.replace(":", "\\:") + + aggregate_queries.append((query, None)) + aggregate_test_defs.append(test_defs) + + if single: + for td in test_defs: + # Add a separate query for each test + add_query([td]) else: - raise ValueError(f"No query template assigned to test_type {self.test_params["test_type"]}") + test_defs_by_table: 
dict[tuple[str, str], list[TestExecutionDef]] = {} + for td in test_defs: + table = (td.schema_name, td.table_name) + if not test_defs_by_table.get(table): + test_defs_by_table[table] = [] + test_defs_by_table[table].append(td) + + max_query_chars = self.connection.max_query_chars - 400 + for test_defs in test_defs_by_table.values(): + # Add new query for each table + current_chars = 0 + current_test_defs = [] + + for td in test_defs: + td_chars = len(td.measure_expression) + len(td.condition_expression) + 2 * len(concat_operator) + # Add new query if current query will become bigger than character limit + if (current_chars + td_chars) > max_query_chars: + add_query(current_test_defs) + current_chars = 0 + current_test_defs = [] + + current_chars += td_chars + current_test_defs.append(td) + + add_query(current_test_defs) + + return aggregate_queries, aggregate_test_defs + + def get_cat_test_results( + self, + aggregate_results: list[AggregateResult], + aggregate_test_defs: list[list[TestExecutionDef]], + ) -> list[list[UUID | str | datetime | int | None]]: + test_results: list[list[UUID | str | datetime | int | None]] = [] + for result in aggregate_results: + test_defs = aggregate_test_defs[result["query_index"]] + result_measures = result["result_measures"].split("|") + result_codes = result["result_codes"].split(",") + + for index, td in enumerate(test_defs): + test_results.append([ + self.test_run.id, + self.test_run.test_suite_id, + self.test_run.test_starttime, + td.id, + td.test_type, + td.schema_name, + td.table_name, + td.column_name, + td.skip_errors or 0, + self._get_input_parameters(td), + result_codes[index], + None, # result_status will be calculated later + None, # No result_message + result_measures[index] if result_measures[index] != self.null_value else None, + ]) + + return test_results + + def update_test_results(self) -> list[tuple[str, dict]]: + # Runs on App database + return [ + self._get_query("update_test_results.sql"), + self._get_query("update_test_run_stats.sql"), + ] diff --git a/testgen/commands/queries/generate_tests_query.py b/testgen/commands/queries/generate_tests_query.py index bf23b7bf..cece2d3e 100644 --- a/testgen/commands/queries/generate_tests_query.py +++ b/testgen/commands/queries/generate_tests_query.py @@ -1,7 +1,8 @@ import logging +from datetime import UTC, datetime from typing import ClassVar, TypedDict -from testgen.common import CleanSQL, date_service, read_template_sql_file +from testgen.common import CleanSQL, read_template_sql_file from testgen.common.database.database_service import get_flavor_service, replace_params from testgen.common.read_file import get_template_files @@ -33,7 +34,7 @@ def __init__(self, flavor): self.sql_flavor = flavor self.flavor_service = get_flavor_service(flavor) - today = date_service.get_now_as_string() + today = datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S") self.run_date = today self.as_of_date = today diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index 93dbe03d..a5fd7ba0 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -1,158 +1,133 @@ +import dataclasses import re -import typing - -from testgen.commands.queries.refresh_data_chars_query import CRefreshDataCharsSQL -from testgen.commands.queries.rollup_scores_query import CRollupScoresSQL -from testgen.common import date_service, read_template_sql_file, read_template_yaml_file -from testgen.common.database.database_service import 
get_flavor_service, replace_params +from uuid import UUID + +from testgen.commands.queries.refresh_data_chars_query import ColumnChars +from testgen.common import read_template_sql_file, read_template_yaml_file +from testgen.common.database.database_service import replace_params +from testgen.common.models.connection import Connection +from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.models.table_group import TableGroup from testgen.common.read_file import replace_templated_functions -class CProfilingSQL: - dctSnippetTemplate: typing.ClassVar = {} - - project_code = "" - connection_id = "" - table_groups_id = "" - flavor = "" - run_date = "" - data_schema = "" - data_table = "" - - col_name = "" - col_gen_type = "" - col_type = "" - db_data_type = "" - col_ordinal_position = "0" - col_is_decimal = "" - col_top_freq_update = "" - - parm_table_set = None - parm_table_include_mask = None - parm_table_exclude_mask = None - parm_do_patterns = "Y" - parm_max_pattern_length = 25 - parm_do_freqs = "Y" - parm_do_sample = "N" - parm_sample_size = 0 - profile_run_id = "" - profile_id_column_mask = "" - profile_sk_column_mask = "" - profile_use_sampling = "" - profile_flag_cdes = False - profile_sample_percent = "" - profile_sample_min_count = "" - - sampling_table = "" - sample_ratio = "" - sample_percent_calc = "" - - process_id = None - - contingency_max_values = "4" - contingency_columns = "" - - exception_message = "" - minutes_offset = 0 - - _data_chars_sql: CRefreshDataCharsSQL = None - _rollup_scores_sql: CRollupScoresSQL = None - - def __init__(self, strProjectCode, flavor, minutes_offset=0): - self.flavor = flavor - self.project_code = strProjectCode - # Defaults - self.run_date = date_service.get_now_as_string_with_offset(minutes_offset) - self.today = date_service.get_now_as_string_with_offset(minutes_offset) - self.minutes_offset = minutes_offset - - def _get_data_chars_sql(self) -> CRefreshDataCharsSQL: - if not self._data_chars_sql: - params = { - "project_code": self.project_code, - "sql_flavor": self.flavor, - "table_group_schema": self.data_schema, - "table_groups_id": self.table_groups_id, - "max_query_chars": None, - "profiling_table_set": self.parm_table_set, - "profiling_include_mask": self.parm_table_include_mask, - "profiling_exclude_mask": self.parm_table_exclude_mask, - } - self._data_chars_sql = CRefreshDataCharsSQL(params, self.run_date, "v_latest_profile_results") - - return self._data_chars_sql - - def _get_rollup_scores_sql(self) -> CRollupScoresSQL: - if not self._rollup_scores_sql: - self._rollup_scores_sql = CRollupScoresSQL(self.profile_run_id, self.table_groups_id) - - return self._rollup_scores_sql - - def _get_params(self) -> dict: - return { - "PROJECT_CODE": self.project_code, - "CONNECTION_ID": self.connection_id, - "TABLE_GROUPS_ID": self.table_groups_id, +@dataclasses.dataclass +class TableSampling: + table_name: str + sample_count: int + sample_ratio: float + sample_percent: float + + +@dataclasses.dataclass +class HygieneIssueType: + id: str + anomaly_type: str + data_object: str + anomaly_criteria: str + detail_expression: str + dq_score_prevalence_formula: str + dq_score_risk_factor: str + + +class ProfilingSQL: + + profiling_results_table = "profile_results" + frequency_staging_table = "stg_secondary_profile_updates" + error_columns = ( + "project_code", + "connection_id", + "table_groups_id", + "schema_name", + "profile_run_id", + "run_date", + "table_name", + "column_name", + "position", + "column_type", + 
"general_type", + "db_data_type", + "record_ct", + "query_error", + ) + + max_pattern_length = 25 + max_error_length = 2000 + + def __init__(self, connection: Connection, table_group: TableGroup, profiling_run: ProfilingRun): + self.connection = connection + self.table_group = table_group + self.profiling_run = profiling_run + self.run_date = profiling_run.profiling_starttime.strftime("%Y-%m-%d %H:%M:%S") + self.flavor = connection.sql_flavor + self._profiling_template: dict = None + + def _get_params(self, column_chars: ColumnChars | None = None, table_sampling: TableSampling | None = None) -> dict: + params = { + "PROJECT_CODE": self.table_group.project_code, + "CONNECTION_ID": self.connection.connection_id, + "TABLE_GROUPS_ID": self.table_group.id, + "PROFILE_RUN_ID": self.profiling_run.id, "RUN_DATE": self.run_date, - "DATA_SCHEMA": self.data_schema, - "DATA_TABLE": self.data_table, - "COL_NAME": self.col_name, - "COL_NAME_SANITIZED": self.col_name.replace("'", "''"), - "COL_GEN_TYPE": self.col_gen_type, - "COL_TYPE": self.col_type or "", - "DB_DATA_TYPE": self.db_data_type or "", - "COL_POS": self.col_ordinal_position, - "TOP_FREQ": self.col_top_freq_update, - "PROFILE_RUN_ID": self.profile_run_id, - "PROFILE_ID_COLUMN_MASK": self.profile_id_column_mask, - "PROFILE_SK_COLUMN_MASK": self.profile_sk_column_mask, - "START_TIME": self.today, - "NOW_TIMESTAMP": date_service.get_now_as_string_with_offset(minutes_offset=self.minutes_offset), - "EXCEPTION_MESSAGE": self.exception_message, - "SAMPLING_TABLE": self.sampling_table, - "SAMPLE_SIZE": int(self.parm_sample_size), - "PROFILE_USE_SAMPLING": self.profile_use_sampling, - "PROFILE_SAMPLE_PERCENT": self.profile_sample_percent, - "PROFILE_SAMPLE_MIN_COUNT": self.profile_sample_min_count, - "PROFILE_SAMPLE_RATIO": self.sample_ratio, - "SAMPLE_PERCENT_CALC": self.sample_percent_calc, - "PARM_MAX_PATTERN_LENGTH": self.parm_max_pattern_length, - "CONTINGENCY_COLUMNS": self.contingency_columns, - "CONTINGENCY_MAX_VALUES": self.contingency_max_values, - "PROCESS_ID": self.process_id, "SQL_FLAVOR": self.flavor, - "QUOTE": get_flavor_service(self.flavor).quote_character + "DATA_SCHEMA": self.table_group.table_group_schema, + "PROFILE_ID_COLUMN_MASK": self.table_group.profile_id_column_mask, + "PROFILE_SK_COLUMN_MASK": self.table_group.profile_sk_column_mask, + "MAX_PATTERN_LENGTH": self.max_pattern_length, } + if column_chars: + params.update({ + "DATA_TABLE": column_chars.table_name, + "COL_NAME": column_chars.column_name, + "COL_NAME_SANITIZED": column_chars.column_name.replace("'", "''"), + "COL_GEN_TYPE": column_chars.general_type, + "COL_TYPE": column_chars.column_type, + "DB_DATA_TYPE": column_chars.db_data_type, + "COL_POS": column_chars.ordinal_position, + }) + if table_sampling: + params.update({ + "SAMPLING_TABLE": table_sampling.table_name, + "SAMPLE_SIZE": table_sampling.sample_count, + "PROFILE_SAMPLE_RATIO": table_sampling.sample_ratio, + "SAMPLE_PERCENT_CALC": table_sampling.sample_percent, + }) + return params def _get_query( self, template_file_name: str, sub_directory: str | None = "profiling", extra_params: dict | None = None, + column_chars: ColumnChars | None = None, + table_sampling: TableSampling | None = None, ) -> tuple[str | None, dict]: query = read_template_sql_file(template_file_name, sub_directory) params = {} if query: - query = self._process_conditionals(query) + query = self._process_conditionals(query, extra_params) + params.update(self._get_params(column_chars, table_sampling)) if extra_params: 
params.update(extra_params) - params.update(self._get_params()) query = replace_params(query, params) query = replace_templated_functions(query, self.flavor) return query, params - def _process_conditionals(self, query: str): + def _process_conditionals(self, query: str, extra_params: dict | None = None) -> str: re_pattern = re.compile(r"^--\s+TG-(IF|ELSE|ENDIF)(?:\s+(\w+))?\s*$") condition = None updated_query = [] for line in query.splitlines(True): if re_match := re_pattern.match(line): match re_match.group(1): - case "IF" if condition is None and re_match.group(2) is not None: - condition = bool(getattr(self, re_match.group(2))) + case "IF" if condition is None and (variable := re_match.group(2)) is not None: + result = extra_params.get(variable) + if result is None: + result = getattr(self, variable, None) + condition = bool(result) case "ELSE" if condition is not None: condition = not condition case "ENDIF" if condition is not None: @@ -167,67 +142,55 @@ def _process_conditionals(self, query: str): return "".join(updated_query) - @property - def do_sample_bool(self): - return self.parm_do_sample == "Y" + def _get_profiling_template(self) -> dict: + if not self._profiling_template: + self._profiling_template = read_template_yaml_file( + "project_profiling_query.yaml", + sub_directory=f"flavors/{self.flavor}/profiling", + ) + return self._profiling_template - def GetSecondProfilingColumnsQuery(self) -> tuple[str, dict]: + def get_frequency_analysis_columns(self) -> tuple[str, dict]: # Runs on App database return self._get_query("secondary_profiling_columns.sql") - def GetSecondProfilingUpdateQuery(self) -> tuple[str, dict]: + def update_frequency_analysis_results(self) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("secondary_profiling_update.sql") + return [ + self._get_query("secondary_profiling_update.sql"), + self._get_query("secondary_profiling_delete.sql"), + ] - def GetSecondProfilingStageDeleteQuery(self) -> tuple[str, dict]: + def update_profiling_results(self) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("secondary_profiling_delete.sql") - - def GetDataTypeSuggestionUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("datatype_suggestions.sql") - - def GetFunctionalDataTypeUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("functional_datatype.sql") - - def GetFunctionalTableTypeStageQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("functional_tabletype_stage.sql") - - def GetFunctionalTableTypeUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("functional_tabletype_update.sql") - - def GetPIIFlagUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("pii_flag.sql") - - def GetAnomalyStatsRefreshQuery(self) -> tuple[str, dict]: + queries = [ + self._get_query("datatype_suggestions.sql"), + self._get_query("functional_datatype.sql"), + self._get_query("functional_tabletype_stage.sql"), + self._get_query("functional_tabletype_update.sql"), + self._get_query("pii_flag.sql"), + ] + if self.table_group.profile_flag_cdes: + queries.append(self._get_query("cde_flagger_query.sql")) + return queries + + def update_hygiene_issue_counts(self) -> tuple[str, dict]: # Runs on App database return self._get_query("refresh_anomalies.sql") - def GetAnomalyScoringRollupRunQuery(self) -> tuple[str, dict]: - # Runs on App database - return 
self._get_rollup_scores_sql().GetRollupScoresProfileRunQuery() - - def GetAnomalyScoringRollupTableGroupQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_rollup_scores_sql().GetRollupScoresProfileTableGroupQuery() - - def GetAnomalyTestTypesQuery(self) -> tuple[str, dict]: + def get_hygiene_issue_types(self) -> tuple[str, dict]: # Runs on App database return self._get_query("profile_anomaly_types_get.sql") - def GetAnomalyTestQuery(self, test_type: dict) -> tuple[str, dict] | None: + def detect_hygiene_issue(self, issue_type: HygieneIssueType) -> tuple[str, dict] | None: # Runs on App database extra_params = { - "ANOMALY_ID": test_type["id"], - "DETAIL_EXPRESSION": test_type["detail_expression"], - "ANOMALY_CRITERIA": test_type["anomaly_criteria"], + "ANOMALY_ID": issue_type.id, + "DETAIL_EXPRESSION": issue_type.detail_expression, + "ANOMALY_CRITERIA": issue_type.anomaly_criteria, } - match test_type["data_object"]: + match issue_type.data_object: case "Column": query, params = self._get_query("profile_anomalies_screen_column.sql", extra_params=extra_params) case "Multi-Col": @@ -243,157 +206,87 @@ def GetAnomalyTestQuery(self, test_type: dict) -> tuple[str, dict] | None: return query, params - def GetAnomalyScoringQuery(self, test_type: dict) -> tuple[str, dict]: + def update_hygiene_issue_prevalence(self, issue_type: HygieneIssueType) -> tuple[str, dict]: # Runs on App database query = read_template_sql_file("profile_anomaly_scoring.sql", sub_directory="profiling") params = { - "PROFILE_RUN_ID": self.profile_run_id, - "ANOMALY_ID": test_type["id"], - "PREV_FORMULA": test_type["dq_score_prevalence_formula"], - "RISK": test_type["dq_score_risk_factor"], + "PROFILE_RUN_ID": self.profiling_run.id, + "ANOMALY_ID": issue_type.id, + "PREV_FORMULA": issue_type.dq_score_prevalence_formula, + "RISK": issue_type.dq_score_risk_factor, } query = replace_params(query, params) return query, params - def GetDataCharsRefreshQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_data_chars_sql().GetDataCharsUpdateQuery() - - def GetCDEFlaggerQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("cde_flagger_query.sql") - - def GetProfileRunInfoRecordsQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("project_profile_run_record_insert.sql") - - def GetProfileRunInfoRecordUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("project_profile_run_record_update.sql") - - def GetDDFQuery(self) -> tuple[str, dict]: + def run_column_profiling(self, column_chars: ColumnChars, table_sampling: TableSampling | None = None) -> tuple[str, dict]: # Runs on Target database - return self._get_data_chars_sql().GetDDFQuery() - - def GetProfilingQuery(self) -> tuple[str, dict]: - # Runs on Target database - if not self.dctSnippetTemplate: - self.dctSnippetTemplate = read_template_yaml_file( - f"project_profiling_query_{self.flavor}.yaml", sub_directory=f"flavors/{self.flavor}/profiling" - ) - - dctSnippetTemplate = self.dctSnippetTemplate - - # Assemble in function - strQ = "" - - if self.parm_do_sample == "Y": - strQ += dctSnippetTemplate["strTemplate01_sampling"] - else: - strQ += dctSnippetTemplate["strTemplate01_else"] - - strQ += dctSnippetTemplate["strTemplate01_5"] - - if self.col_gen_type == "X": - strQ += dctSnippetTemplate["strTemplate02_X"] - else: - strQ += dctSnippetTemplate["strTemplate02_else"] - - if self.col_gen_type in ["A", "D", "N"]: - strQ += 
dctSnippetTemplate["strTemplate03_ADN"] - else: - strQ += dctSnippetTemplate["strTemplate03_else"] - - if self.col_gen_type == "A": - strQ += dctSnippetTemplate["strTemplate04_A"] - elif self.col_gen_type == "N": - strQ += dctSnippetTemplate["strTemplate04_N"] - else: - strQ += dctSnippetTemplate["strTemplate04_else"] - - if self.col_gen_type == "A": - strQ += dctSnippetTemplate["strTemplate05_A"] - else: - strQ += dctSnippetTemplate["strTemplate05_else"] - - if self.col_gen_type == "A" and self.parm_do_patterns == "Y": - strQ += dctSnippetTemplate["strTemplate06_A_patterns"] - else: - strQ += dctSnippetTemplate["strTemplate06_else"] - - strQ += dctSnippetTemplate["strTemplate07_else"] - - if self.col_gen_type == "N": - strQ += dctSnippetTemplate["strTemplate08_N"] - else: - strQ += dctSnippetTemplate["strTemplate08_else"] - - if self.col_gen_type == "N" and self.col_is_decimal == True: - strQ += dctSnippetTemplate["strTemplate10_N_dec"] - else: - strQ += dctSnippetTemplate["strTemplate10_else"] - - if self.col_gen_type == "D": - strQ += dctSnippetTemplate["strTemplate11_D"] - else: - strQ += dctSnippetTemplate["strTemplate11_else"] - if self.col_gen_type == "B": - strQ += dctSnippetTemplate["strTemplate12_B"] + template = self._get_profiling_template() + general_type = column_chars.general_type + + query = "" + query += template["01_sampling" if table_sampling else "01_else"] + query += template["01_all"] + query += template["02_X" if general_type == "X" else "02_else"] + query += template["03_ADN" if general_type in ["A", "D", "N"] else "03_else"] + + if general_type == "A": + query += template["04_A"] + elif general_type == "N": + query += template["04_N"] else: - strQ += dctSnippetTemplate["strTemplate12_else"] - - strQ += dctSnippetTemplate["strTemplate13_ALL"] - - if self.col_gen_type == "A": - if self.parm_do_patterns == "Y": - strQ += dctSnippetTemplate["strTemplate14_A_do_patterns"] - else: - strQ += dctSnippetTemplate["strTemplate14_A_no_patterns"] + query += template["04_else"] + + query += template["05_A" if general_type == "A" else "05_else"] + query += template["06_A" if general_type == "A" else "06_else"] + query += template["08_N" if general_type == "N" else "08_else"] + query += template["10_N_dec" if general_type == "N" and column_chars.is_decimal == True else "10_else"] + query += template["11_D" if general_type == "D" else "11_else"] + query += template["12_B" if general_type == "B" else "12_else"] + query += template["14_A" if general_type == "A" else "14_else"] + query += template["16_all"] + query += template["98_all"] + + if general_type == "N": + query += template["99_N_sampling" if table_sampling else "99_N"] else: - strQ += dctSnippetTemplate["strTemplate14_else"] - - strQ += dctSnippetTemplate["strTemplate15_ALL"] - - strQ += dctSnippetTemplate["strTemplate16_ALL"] - - if self.parm_do_sample == "Y": - strQ += dctSnippetTemplate["strTemplate98_sampling"] - else: - strQ += dctSnippetTemplate["strTemplate98_else"] - - if self.col_gen_type == "N": - if self.parm_do_sample == "Y": - strQ += dctSnippetTemplate["strTemplate99_N_sampling"] - else: - strQ += dctSnippetTemplate["strTemplate99_N"] - else: - strQ += dctSnippetTemplate["strTemplate99_else"] - - if self.parm_do_sample == "Y": - strQ += dctSnippetTemplate["strTemplate100_sampling"] + query += template["99_else"] - params = self._get_params() - query = replace_params(strQ, params) + params = self._get_params(column_chars, table_sampling) + query = replace_params(query, params) query = 
replace_templated_functions(query, self.flavor) return query, params - def GetSecondProfilingQuery(self) -> tuple[str, dict]: - # Runs on Target database - return self._get_query(f"project_secondary_profiling_query_{self.flavor}.sql", f"flavors/{self.flavor}/profiling") - - def GetTableSampleCount(self) -> tuple[str, dict]: - # Runs on Target database - return self._get_query(f"project_get_table_sample_count_{self.flavor}.sql", f"flavors/{self.flavor}/profiling") - - def GetContingencyColumns(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("contingency_columns.sql") - - def GetContingencyCounts(self) -> tuple[str, dict]: + def get_profiling_errors(self, column_errors: list[tuple[ColumnChars, str]]) -> list[list[str | UUID | int]]: + return [ + [ + self.table_group.project_code, + self.connection.connection_id, + self.table_group.id, + self.table_group.table_group_schema, + self.profiling_run.id, + self.profiling_run.profiling_starttime, + column_chars.table_name, + column_chars.column_name.replace("'", "''"), + column_chars.ordinal_position, + column_chars.column_type, + "X", + column_chars.db_data_type, + column_chars.record_ct, + error[:self.max_error_length], + ] for column_chars, error in column_errors + ] + + def run_frequency_analysis(self, column_chars: ColumnChars, table_sampling: TableSampling | None = None) -> tuple[str, dict]: # Runs on Target database - return self._get_query("contingency_counts.sql", "flavors/generic/profiling") - - def UpdateProfileResultsToEst(self) -> tuple[str, dict]: + return self._get_query( + "project_secondary_profiling_query.sql", + f"flavors/{self.flavor}/profiling", + extra_params={"do_sample_bool": table_sampling is not None}, + column_chars=column_chars, + table_sampling=table_sampling, + ) + + def update_sampled_profiling_results(self, table_sampling: TableSampling) -> tuple[str, dict]: # Runs on App database - return self._get_query("project_update_profile_results_to_estimates.sql") + return self._get_query("project_update_profile_results_to_estimates.sql", table_sampling=table_sampling) diff --git a/testgen/commands/queries/refresh_data_chars_query.py b/testgen/commands/queries/refresh_data_chars_query.py index d6a0359d..9ef02506 100644 --- a/testgen/commands/queries/refresh_data_chars_query.py +++ b/testgen/commands/queries/refresh_data_chars_query.py @@ -1,99 +1,155 @@ +import dataclasses +from collections.abc import Iterable +from datetime import datetime + from testgen.common import read_template_sql_file from testgen.common.database.database_service import get_flavor_service, replace_params -from testgen.common.database.flavor.flavor_service import SQLFlavor +from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup from testgen.utils import chunk_queries -class CRefreshDataCharsSQL: - run_date: str - source_table: str - - project_code: str - sql_flavor: SQLFlavor - table_group_schema: str - table_group_id: str +@dataclasses.dataclass +class ColumnChars: + schema_name: str + table_name: str + column_name: str + ordinal_position: int = None + general_type: str = None + column_type: str = None + db_data_type: str = None + is_decimal: bool = False + approx_record_ct: int = None + record_ct: int = None - max_query_chars: int - profiling_table_set: str - profiling_include_mask: str - profiling_exclude_mask: str - def __init__(self, params: dict, run_date: str, source_table: str): - self.run_date = run_date - self.source_table = source_table +class 
RefreshDataCharsSQL: - self.project_code = params["project_code"] - self.sql_flavor = params["sql_flavor"] - self.table_group_schema = params["table_group_schema"] - self.table_group_id = params["table_groups_id"] + staging_table = "stg_data_chars_updates" + staging_columns = ( + "table_groups_id", + "run_date", + "schema_name", + "table_name", + "column_name", + "position", + "general_type", + "column_type", + "db_data_type", + "approx_record_ct", + "record_ct", + ) - self.max_query_chars = params["max_query_chars"] - self.profiling_table_set = params["profiling_table_set"] - self.profiling_include_mask = params["profiling_include_mask"] - self.profiling_exclude_mask = params["profiling_exclude_mask"] + def __init__(self, connection: Connection, table_group: TableGroup): + self.connection = connection + self.table_group = table_group + self.flavor = connection.sql_flavor + self.flavor_service = get_flavor_service(self.flavor) - def _get_query(self, template_file_name: str, sub_directory: str | None = "data_chars") -> tuple[str, dict]: + def _get_query( + self, + template_file_name: str, + sub_directory: str | None = "data_chars", + extra_params: dict | None = None, + ) -> tuple[str, dict]: query = read_template_sql_file(template_file_name, sub_directory) params = { - "PROJECT_CODE": self.project_code, - "DATA_SCHEMA": self.table_group_schema, - "TABLE_GROUPS_ID": self.table_group_id, - "RUN_DATE": self.run_date, - "SOURCE_TABLE": self.source_table, + "DATA_SCHEMA": self.table_group.table_group_schema, + "TABLE_GROUPS_ID": self.table_group.id, } + if extra_params: + params.update(extra_params) query = replace_params(query, params) return query, params def _get_table_criteria(self) -> str: table_criteria = "" - flavor_service = get_flavor_service(self.sql_flavor) - - if self.profiling_table_set: - table_criteria += f" AND c.{flavor_service.ddf_table_ref} IN ({self.profiling_table_set})" + ddf_table_ref = self.flavor_service.ddf_table_ref + escaped_underscore = self.flavor_service.escaped_underscore + escape_clause = self.flavor_service.escape_clause + + if self.table_group.profiling_table_set: + quoted_table_names = ",".join( + [f"'{item.strip()}'" for item in self.table_group.profiling_table_set.split(",")] + ) + table_criteria += f" AND c.{ddf_table_ref} IN ({quoted_table_names})" - if self.profiling_include_mask: + if self.table_group.profiling_include_mask: include_table_names = [ - item.strip().replace("_", flavor_service.escaped_underscore) - for item in self.profiling_include_mask.split(",") + item.strip().replace("_", escaped_underscore) + for item in self.table_group.profiling_include_mask.split(",") ] table_criteria += f""" AND ( - {" OR ".join([ f"(c.{flavor_service.ddf_table_ref} LIKE '{item}' {flavor_service.escape_clause})" for item in include_table_names ])} + {" OR ".join([ f"(c.{ddf_table_ref} LIKE '{item}' {escape_clause})" for item in include_table_names ])} ) """ - if self.profiling_exclude_mask: + if self.table_group.profiling_exclude_mask: exclude_table_names = [ - item.strip().replace("_", flavor_service.escaped_underscore) - for item in self.profiling_exclude_mask.split(",") + item.strip().replace("_", escaped_underscore) + for item in self.table_group.profiling_exclude_mask.split(",") ] table_criteria += f""" AND NOT ( - {" OR ".join([ f"(c.{flavor_service.ddf_table_ref} LIKE '{item}' {flavor_service.escape_clause})" for item in exclude_table_names ])} + {" OR ".join([ f"(c.{ddf_table_ref} LIKE '{item}' {escape_clause})" for item in exclude_table_names ])} ) """ 
return table_criteria - def GetDDFQuery(self) -> tuple[str, dict]: + def get_schema_ddf(self) -> tuple[str, dict]: # Runs on Target database - query, params = self._get_query(f"schema_ddf_query_{self.sql_flavor}.sql", f"flavors/{self.sql_flavor}/data_chars") - query = query.replace("{TABLE_CRITERIA}", self._get_table_criteria()) - return query, params + return self._get_query( + "get_schema_ddf.sql", + f"flavors/{self.flavor}/data_chars", + extra_params={"TABLE_CRITERIA": self._get_table_criteria()}, + ) - def GetRecordCountQueries(self, schema_tables: list[str]) -> list[tuple[str, None]]: + def get_row_counts(self, table_names: Iterable[str]) -> list[tuple[str, None]]: # Runs on Target database + schema = self.table_group.table_group_schema + quote = self.flavor_service.quote_character count_queries = [ - f"SELECT '{item}', COUNT(*) FROM {item}" - for item in schema_tables + f"SELECT '{table}', COUNT(*) FROM {quote}{schema}{quote}.{quote}{table}{quote}" + for table in table_names ] - chunked_queries = chunk_queries(count_queries, " UNION ALL ", self.max_query_chars) + chunked_queries = chunk_queries(count_queries, " UNION ALL ", self.connection.max_query_chars) return [ (query, None) for query in chunked_queries ] - def GetDataCharsUpdateQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("data_chars_update.sql") + def verify_access(self, table_name: str) -> tuple[str, None]: + # Runs on Target database + schema = self.table_group.table_group_schema + quote = self.flavor_service.quote_character + query = ( + f"SELECT 1 FROM {quote}{schema}{quote}.{quote}{table_name}{quote} LIMIT 1" + if not self.flavor_service.use_top + else f"SELECT TOP 1 * FROM {quote}{schema}{quote}.{quote}{table_name}{quote}" + ) + return (query, None) - def GetStagingDeleteQuery(self) -> tuple[str, dict]: + def get_staging_data_chars(self, data_chars: list[ColumnChars], run_date: datetime) -> list[list[str | bool | int]]: + return [ + [ + self.table_group.id, + run_date, + column.schema_name, + column.table_name, + column.column_name, + column.ordinal_position, + column.general_type, + column.column_type, + column.db_data_type, + column.approx_record_ct, + column.record_ct, + ] + for column in data_chars + ] + + def update_data_chars(self, run_date: str) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("data_chars_staging_delete.sql") + params = {"RUN_DATE": run_date} + return [ + self._get_query("data_chars_update.sql", extra_params=params), + self._get_query("data_chars_staging_delete.sql", extra_params=params), + ] diff --git a/testgen/commands/queries/rollup_scores_query.py b/testgen/commands/queries/rollup_scores_query.py index dde0d556..0d6bfc49 100644 --- a/testgen/commands/queries/rollup_scores_query.py +++ b/testgen/commands/queries/rollup_scores_query.py @@ -4,35 +4,47 @@ from testgen.common.database.database_service import replace_params -class CRollupScoresSQL: +class RollupScoresSQL: run_id: str - table_group_id: str + table_group_id: str | None def __init__(self, run_id: str, table_group_id: str | UUID | None = None): self.run_id = run_id - self.table_group_id = str(table_group_id) + self.table_group_id = str(table_group_id) if table_group_id is not None else None - def _get_query(self, template_file_name: str, sub_directory: str | None = "rollup_scores") -> tuple[str, dict]: + def _get_query( + self, + template_file_name: str, + sub_directory: str | None = "rollup_scores", + no_bind: bool = False, + ) -> tuple[str, dict]: query = 
read_template_sql_file(template_file_name, sub_directory) params = { "RUN_ID": self.run_id, - "TABLE_GROUPS_ID": self.table_group_id or "" + "TABLE_GROUPS_ID": self.table_group_id or "", } query = replace_params(query, params) - return query, params + return query, None if no_bind else params - def GetRollupScoresProfileRunQuery(self) -> tuple[str, dict]: + def rollup_profiling_scores(self) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("rollup_scores_profile_run.sql") + queries = [ + self._get_query("rollup_scores_profile_run.sql"), + ] + if self.table_group_id: + queries.append(self._get_query("rollup_scores_profile_table_group.sql")) + return queries - def GetRollupScoresProfileTableGroupQuery(self) -> tuple[str, dict]: + def rollup_test_scores(self, update_prevalence: bool = False, update_table_group: bool = False) -> list[tuple[str, dict]]: # Runs on App database - return self._get_query("rollup_scores_profile_table_group.sql") - - def GetRollupScoresTestRunQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("rollup_scores_test_run.sql") - - def GetRollupScoresTestTableGroupQuery(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("rollup_scores_test_table_group.sql") + queries = [] + + if update_prevalence: + queries.append(self._get_query("calc_prevalence_test_results.sql", no_bind=True)) + + queries.append(self._get_query("rollup_scores_test_run.sql")) + + if update_table_group: + queries.append(self._get_query("rollup_scores_test_table_group.sql")) + + return queries diff --git a/testgen/commands/queries/test_parameter_validation_query.py b/testgen/commands/queries/test_parameter_validation_query.py deleted file mode 100644 index c7f40c35..00000000 --- a/testgen/commands/queries/test_parameter_validation_query.py +++ /dev/null @@ -1,78 +0,0 @@ -import typing - -from testgen.common import CleanSQL, date_service, read_template_sql_file -from testgen.common.database.database_service import get_flavor_service, replace_params - - -class CTestParamValidationSQL: - flavor = "" - run_date = "" - test_run_id = "" - test_schemas: str = "" - message = "" - test_ids: typing.ClassVar = [] - exception_message = "" - flag_val = "" - tg_schema = "" - - _use_clean = False - - def __init__(self, strFlavor, strTestSuiteId): - self.flavor = strFlavor - self.flavor_service = get_flavor_service(strFlavor) - self.test_suite_id = strTestSuiteId - self.today = date_service.get_now_as_string() - - def _get_query(self, template_file_name: str, sub_directory: str | None = "validate_tests") -> tuple[str, dict]: - query = read_template_sql_file(template_file_name, sub_directory) - params = { - "TEST_SUITE_ID": self.test_suite_id, - "RUN_DATE": self.run_date, - "TEST_RUN_ID": self.test_run_id, - "FLAG": self.flag_val, - "TEST_SCHEMAS": self.test_schemas, - "EXCEPTION_MESSAGE": self.exception_message, - "MESSAGE": self.message, - "CAT_TEST_IDS": tuple(self.test_ids or []), - "START_TIME": self.today, - "NOW_TIMESTAMP": date_service.get_now_as_string(), - "DATA_SCHEMA": self.tg_schema, - "QUOTE": self.flavor_service.quote_character, - } - query = replace_params(query, params) - return query, params - - def GetTestValidationColumns(self) -> tuple[str, dict]: - # Runs on App database - query, params = self._get_query("ex_get_test_column_list_tg.sql") - if self._use_clean: - query = CleanSQL(query) - return query, params - - def GetProjectTestValidationColumns(self) -> tuple[str, dict]: - # Runs on Target database - filename = 
"ex_get_project_column_list.sql" - try: - return self._get_query(filename, f"flavors/{self.flavor}/validate_tests") - except ModuleNotFoundError: - return self._get_query(filename, "flavors/generic/validate_tests") - - def PrepFlagTestsWithFailedValidation(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_prep_flag_tests_test_definitions.sql") - - def FlagTestsWithFailedValidation(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_flag_tests_test_definitions.sql") - - def DisableTestsWithFailedValidation(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_disable_tests_test_definitions.sql") - - def ReportTestValidationErrors(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_write_test_val_errors.sql") - - def PushTestRunStatusUpdateSQL(self) -> tuple[str, dict]: - # Runs on App database - return self._get_query("ex_update_test_record_in_testrun_table.sql", "execution") diff --git a/testgen/commands/run_execute_cat_tests.py b/testgen/commands/run_execute_cat_tests.py deleted file mode 100644 index 15d30a14..00000000 --- a/testgen/commands/run_execute_cat_tests.py +++ /dev/null @@ -1,148 +0,0 @@ -import logging -from datetime import UTC, datetime - -from progress.spinner import Spinner - -from testgen import settings -from testgen.commands.queries.execute_cat_tests_query import CCATExecutionSQL -from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results -from testgen.common import ( - date_service, - execute_db_queries, - fetch_dict_from_db, - fetch_from_db_threaded, - write_to_app_db, -) -from testgen.common.get_pipeline_parms import TestExecutionParams -from testgen.common.mixpanel_service import MixpanelService - -LOG = logging.getLogger("testgen") - - -def FinalizeTestRun(clsCATExecute: CCATExecutionSQL, username: str | None = None): - _, row_counts = execute_db_queries([ - clsCATExecute.FinalizeTestResultsSQL(), - clsCATExecute.PushTestRunStatusUpdateSQL(), - clsCATExecute.FinalizeTestSuiteUpdateSQL(), - ]) - end_time = datetime.now(UTC) - - try: - execute_db_queries([ - clsCATExecute.CalcPrevalenceTestResultsSQL(), - clsCATExecute.TestScoringRollupRunSQL(), - clsCATExecute.TestScoringRollupTableGroupSQL(), - ]) - run_refresh_score_cards_results( - project_code=clsCATExecute.project_code, - add_history_entry=True, - refresh_date=date_service.parse_now(clsCATExecute.run_date), - ) - except Exception: - LOG.exception("Error refreshing scores after test run") - pass - - MixpanelService().send_event( - "run-tests", - source=settings.ANALYTICS_JOB_SOURCE, - username=username, - sql_flavor=clsCATExecute.flavor, - test_count=row_counts[0], - run_duration=(end_time - date_service.parse_now(clsCATExecute.run_date)).total_seconds(), - scoring_duration=(datetime.now(UTC) - end_time).total_seconds(), - ) - - -def run_cat_test_queries( - params: TestExecutionParams, - test_run_id: str, - test_time: str, - project_code: str, - test_suite: str, - error_msg: str, - username: str | None = None, - minutes_offset: int = 0, - spinner: Spinner | None = None -): - has_errors = False - - LOG.info("CurrentStep: Initializing CAT Query Generator") - clsCATExecute = CCATExecutionSQL( - project_code, params["test_suite_id"], test_suite, params["sql_flavor"], params["max_query_chars"], minutes_offset - ) - clsCATExecute.test_run_id = test_run_id - clsCATExecute.run_date = test_time - clsCATExecute.table_groups_id = params["table_groups_id"] - 
clsCATExecute.exception_message += error_msg - - # START TEST EXECUTION - - if spinner: - spinner.next() - - lstAllResults = [] - - try: - # Retrieve distinct target tables from metadata - LOG.info("CurrentStep: Retrieving Target Tables") - # Gets distinct list of tables to be tested, to aggregate tests by table, from dk db - lstTables = fetch_dict_from_db(*clsCATExecute.GetDistinctTablesSQL()) - LOG.info("Test Tables Identified: %s", len(lstTables)) - - if lstTables: - LOG.info("CurrentStep: Aggregating CAT Tests per Table") - for dctTable in lstTables: - clsCATExecute.target_schema = dctTable["schema_name"] - clsCATExecute.target_table = dctTable["table_name"] - # Writes records of aggregated tests per table and sequence number - # (to prevent table queries from getting too large) to dk db. - execute_db_queries([clsCATExecute.GetAggregateTableTestSQL()]) - - LOG.info("CurrentStep: Retrieving CAT Tests to Run") - # Retrieves records of aggregated tests to run as queries from dk db - lstCATParms = fetch_dict_from_db(*clsCATExecute.GetAggregateTestParmsSQL()) - - lstCATQueries = [] - # Prepares CAT Queries and populates query list - LOG.info("CurrentStep: Preparing CAT Queries") - for dctCATQuery in lstCATParms: - clsCATExecute.target_schema = dctCATQuery["schema_name"] - clsCATExecute.target_table = dctCATQuery["table_name"] - clsCATExecute.cat_test_params = dctCATQuery - lstCATQueries.append(clsCATExecute.PrepCATQuerySQL()) - - if lstCATQueries: - LOG.info("CurrentStep: Performing CAT Tests") - lstAllResults, lstResultColumnNames, intErrors = fetch_from_db_threaded( - lstCATQueries, use_target_db=True, max_threads=params["max_threads"], spinner=spinner - ) - - if lstAllResults: - LOG.info("CurrentStep: Saving CAT Results") - # Write aggregate result records to aggregate result table at dk db - write_to_app_db(lstAllResults, lstResultColumnNames, "working_agg_cat_results") - LOG.info("CurrentStep: Parsing CAT Results") - # Parses aggregate results to individual test_result records at dk db - execute_db_queries([clsCATExecute.GetCATResultsParseSQL()]) - LOG.info("Test results successfully parsed.") - if intErrors > 0: - has_errors = True - cat_error_msg = f"Errors were encountered executing aggregate tests. ({intErrors} errors occurred.) Please check log." 
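# --- Editor's note (illustrative sketch, not part of the patch) ---
# The module removed above batched many column-level "CAT" checks into a single SELECT per
# table so each target table is scanned only once. This is a hedged sketch of that aggregation
# idea only; the aliases and expressions below are invented examples, not the real ex_cat_*
# SQL templates.
def build_aggregate_test_query(schema: str, table: str, checks: list[tuple[str, str]]) -> str:
    # checks: (result_alias, SQL aggregate expression) pairs collected for one table
    select_list = ", ".join(f"{expression} AS {alias}" for alias, expression in checks)
    return f"SELECT '{schema}' AS schema_name, '{table}' AS table_name, {select_list} FROM {schema}.{table}"

# build_aggregate_test_query("sales", "orders", [
#     ("null_ct_customer_id", "SUM(CASE WHEN customer_id IS NULL THEN 1 ELSE 0 END)"),
#     ("future_date_ct", "SUM(CASE WHEN order_date > CURRENT_DATE THEN 1 ELSE 0 END)"),
# ])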
- LOG.warning(cat_error_msg) - clsCATExecute.exception_message += cat_error_msg - else: - LOG.info("No valid tests were available to perform") - - except Exception as e: - has_errors = True - sqlsplit = e.args[0].split("[SQL", 1) - errorline = sqlsplit[0].replace("'", "''") if len(sqlsplit) > 0 else "unknown error" - clsCATExecute.exception_message += f"{type(e).__name__}: {errorline}" - raise - - else: - return has_errors - - finally: - LOG.info("Finalizing test run") - FinalizeTestRun(clsCATExecute, username) diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py deleted file mode 100644 index af9dd9fd..00000000 --- a/testgen/commands/run_execute_tests.py +++ /dev/null @@ -1,199 +0,0 @@ -import logging -import subprocess -import threading -import uuid - -from progress.spinner import Spinner - -import testgen.common.process_service as process_service -from testgen import settings -from testgen.commands.queries.execute_tests_query import CTestExecutionSQL -from testgen.common import ( - date_service, - execute_db_queries, - fetch_dict_from_db, - fetch_from_db_threaded, - get_test_execution_params, - set_target_db_params, - write_to_app_db, -) -from testgen.common.database.database_service import empty_cache -from testgen.common.get_pipeline_parms import TestExecutionParams -from testgen.common.models import with_database_session -from testgen.common.models.connection import Connection -from testgen.ui.session import session - -from .run_execute_cat_tests import run_cat_test_queries -from .run_refresh_data_chars import run_refresh_data_chars_queries -from .run_test_parameter_validation import run_parameter_validation_queries - -LOG = logging.getLogger("testgen") - - -def add_test_run_record(test_run_id: str, test_suite_id: str, test_time: str, process_id: int): - execute_db_queries([( - """ - INSERT INTO test_runs(id, test_suite_id, test_starttime, process_id) - (SELECT :test_run_id as id, - :test_suite_id as test_suite_id, - :test_time as test_starttime, - :process_id as process_id); - """, - { - "test_run_id": test_run_id, - "test_suite_id": test_suite_id, - "test_time": test_time, - "process_id": process_id, - } - )]) - - -def run_test_queries( - params: TestExecutionParams, - test_run_id: str, - test_time: str, - project_code: str, - test_suite: str, - minutes_offset: int = 0, - spinner: Spinner | None = None, -): - has_errors = False - error_msg = "" - - LOG.info("CurrentStep: Initializing Query Generator") - - clsExecute = CTestExecutionSQL(project_code, params["sql_flavor"], params["test_suite_id"], test_suite, minutes_offset) - clsExecute.run_date = test_time - clsExecute.test_run_id = test_run_id - clsExecute.process_id = process_service.get_current_process_id() - - try: - # Update Historic Test Thresholds - LOG.info("CurrentStep: Updating Historic Test Thresholds") - execute_db_queries([clsExecute.GetHistoricThresholdUpdate()]) - - # Retrieve non-CAT Queries - LOG.info("CurrentStep: Retrieve Non-CAT Queries") - lstTestSet = fetch_dict_from_db(*clsExecute.GetTestsNonCAT()) - - if len(lstTestSet) == 0: - LOG.debug("0 non-CAT Queries retrieved.") - - if lstTestSet: - LOG.info("CurrentStep: Preparing Non-CAT Tests") - lstTestQueries = [] - for dctTest in lstTestSet: - clsExecute.test_params = dctTest - lstTestQueries.append(clsExecute.GetTestQuery()) - if spinner: - spinner.next() - - # Execute list, returning test results - LOG.info("CurrentStep: Executing Non-CAT Test Queries") - lstTestResults, colResultNames, intErrors = fetch_from_db_threaded( 
- lstTestQueries, use_target_db=True, max_threads=params["max_threads"], spinner=spinner - ) - - # Copy test results to DK DB - LOG.info("CurrentStep: Saving Non-CAT Test Results") - if lstTestResults: - write_to_app_db(lstTestResults, colResultNames, "test_results") - if intErrors > 0: - has_errors = True - error_msg = ( - f"Errors were encountered executing Referential Tests. ({intErrors} errors occurred.) " - "Please check log. " - ) - LOG.warning(error_msg) - else: - LOG.info("No tests found") - - except Exception as e: - sqlsplit = e.args[0].split("[SQL", 1) - errorline = sqlsplit[0].replace("'", "''") if len(sqlsplit) > 0 else "unknown error" - clsExecute.exception_message = f"{type(e).__name__}: {errorline}" - LOG.info("Updating the test run record with exception message") - execute_db_queries([clsExecute.PushTestRunStatusUpdateSQL()]) - raise - - else: - return has_errors, error_msg - - -def run_execution_steps_in_background(project_code, test_suite): - msg = f"Starting run_execution_steps_in_background against test suite: {test_suite}" - if settings.IS_DEBUG: - LOG.info(msg + ". Running in debug mode (new thread instead of new process).") - empty_cache() - username = None - if session.auth: - username = session.auth.user_display - background_thread = threading.Thread( - target=run_execution_steps, - args=(project_code, test_suite, username), - ) - background_thread.start() - else: - LOG.info(msg) - script = ["testgen", "run-tests", "--project-key", project_code, "--test-suite-key", test_suite] - subprocess.Popen(script) # NOQA S603 - - -@with_database_session -def run_execution_steps( - project_code: str, - test_suite: str, - username: str | None = None, - minutes_offset: int = 0, - spinner: Spinner | None = None, -) -> str: - # Initialize required parms for all steps - has_errors = False - error_msg = "" - - test_run_id = str(uuid.uuid4()) - test_time = date_service.get_now_as_string_with_offset(minutes_offset) - - if spinner: - spinner.next() - - LOG.info("CurrentStep: Retrieving TestExec Parameters") - test_exec_params = get_test_execution_params(project_code, test_suite) - - # Add a record in Test Run table for the new Test Run - add_test_run_record( - test_run_id, test_exec_params["test_suite_id"], test_time, process_service.get_current_process_id() - ) - - LOG.info("CurrentStep: Assigning Connection Parameters") - connection = Connection.get_by_table_group(test_exec_params["table_groups_id"]) - set_target_db_params(connection.__dict__) - test_exec_params["sql_flavor"] = connection.sql_flavor - test_exec_params["max_query_chars"] = connection.max_query_chars - test_exec_params["max_threads"] = connection.max_threads - - try: - LOG.info("CurrentStep: Execute Step - Data Characteristics Refresh") - run_refresh_data_chars_queries(test_exec_params, test_time, spinner) - except Exception: - LOG.warning("Data Characteristics Refresh failed", exc_info=True, stack_info=True) - pass - - LOG.info("CurrentStep: Execute Step - Test Validation") - run_parameter_validation_queries(test_exec_params, test_run_id, test_time, test_suite) - - LOG.info("CurrentStep: Execute Step - Test Execution") - has_errors, error_msg = run_test_queries( - test_exec_params, test_run_id, test_time, project_code, test_suite, minutes_offset, spinner - ) - - LOG.info("CurrentStep: Execute Step - CAT Test Execution") - if run_cat_test_queries( - test_exec_params, test_run_id, test_time, project_code, test_suite, error_msg, username, minutes_offset, spinner - ): - has_errors = True - - return f""" - Test execution 
completed {"with errors. Check log for details." if has_errors else "successfully."} - Run ID: {test_run_id} - """ diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py index f65a80ec..0d926fbe 100644 --- a/testgen/commands/run_launch_db_config.py +++ b/testgen/commands/run_launch_db_config.py @@ -2,13 +2,11 @@ import os from testgen import settings -from testgen.common import create_database, date_service, execute_db_queries +from testgen.common import create_database, execute_db_queries from testgen.common.credentials import get_tg_db, get_tg_schema from testgen.common.database.database_service import get_queries_for_command from testgen.common.encrypt import EncryptText, encrypt_ui_password from testgen.common.models import with_database_session -from testgen.common.models.scores import ScoreDefinition -from testgen.common.models.table_group import TableGroup from testgen.common.read_file import get_template_files from testgen.common.read_yaml_metadata_records import import_metadata_records_from_yaml @@ -24,14 +22,12 @@ def _get_latest_revision_number(): def _get_params_mapping() -> dict: ui_user_encrypted_password = encrypt_ui_password(settings.PASSWORD) - now = date_service.get_now_as_string() return { "UI_USER_NAME": settings.USERNAME, "UI_USER_USERNAME": settings.USERNAME, "UI_USER_EMAIL": "", "UI_USER_ENCRYPTED_PASSWORD": ui_user_encrypted_password, "SCHEMA_NAME": get_tg_schema(), - "START_DATE": now, "PROJECT_CODE": settings.PROJECT_KEY, "CONNECTION_ID": 1, "SQL_FLAVOR": settings.PROJECT_SQL_FLAVOR, @@ -86,17 +82,9 @@ def run_launch_db_config(delete_db: bool, drop_users_and_roles: bool = True) -> user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", - suppress_logs=True, ) import_metadata_records_from_yaml(params_mapping) - ScoreDefinition.from_table_group( - TableGroup( - project_code=settings.PROJECT_KEY, - table_groups_name=settings.DEFAULT_TABLE_GROUPS_NAME, - ) - ).save() - def get_app_db_params_mapping() -> dict: return _get_params_mapping() diff --git a/testgen/commands/run_pairwise_contingency_check.py b/testgen/commands/run_pairwise_contingency_check.py new file mode 100644 index 00000000..9df98406 --- /dev/null +++ b/testgen/commands/run_pairwise_contingency_check.py @@ -0,0 +1,147 @@ +# UNUSED CODE - TO BE REVIVED LATER + +from uuid import UUID + +import pandas as pd + +from testgen.commands.queries.contingency_query import ContingencySQL +from testgen.commands.queries.profiling_query import ContingencyTable +from testgen.common.database.database_service import fetch_dict_from_db, write_to_app_db + + +def run_pairwise_contingency_check(profiling_run_id: UUID, threshold_ratio: float) -> None: + # Goal: identify pairs of values that represent IF X=A THEN Y=B rules + + threshold_ratio = threshold_ratio / 100.0 if threshold_ratio else 0.95 + + sql_generator = ContingencySQL() + table_columns = fetch_dict_from_db(*sql_generator.get_contingency_columns(profiling_run_id)) + + if not table_columns: + return + + table_columns = [ContingencyTable(item) for item in table_columns] + df_merged = None + for table in table_columns: + counts = fetch_dict_from_db( + *sql_generator.get_contingency_counts(table), + use_target_db=True, + ) + if counts: + df = pd.DataFrame(counts) + columns = table.contingency_columns.lower().split(",") + overall_counts = {col: df.groupby(col)["freq_ct"].sum() for col in columns} + + contingency_table = [] + for i, col1 in 
enumerate(columns): + for col2 in columns[i + 1 :]: + # Create a pivot table for each pair + pivot = df.pivot_table(index=col1, columns=col2, values="freq_ct", aggfunc="sum", fill_value=0) + pivot = pivot.stack().reset_index() + pivot.rename(columns={0: "pair_count"}, inplace=True) + + pivot["first_column_overall_count"] = pivot[col1].map(overall_counts[col1]) + pivot["second_column_overall_count"] = pivot[col2].map(overall_counts[col2]) + + pivot["first_column_name"] = col1 + pivot["second_column_name"] = col2 + + contingency_table.append(pivot) + + # Combine all pairs into a single DataFrame + contingency_table = pd.concat(contingency_table, ignore_index=True) + + contingency_table["pair_to_first_ratio"] = ( + contingency_table["pair_count"] / contingency_table["first_column_overall_count"] + ) + contingency_table["pair_to_second_ratio"] = ( + contingency_table["pair_count"] / contingency_table["second_column_overall_count"] + ) + + # Include rows where both cols meet minimum threshold count (max of 30 or 5%) + total_observations = contingency_table["pair_count"].sum() + threshold_min = max(total_observations * 0.05, 30) + contingency_table = contingency_table[ + (contingency_table["first_column_overall_count"] >= threshold_min) + & (contingency_table["second_column_overall_count"] >= threshold_min) + ] + # Drop rows where neither ratio meets the threshold ratio (keep if either meets it) + # -- note we still have to check individual columns when saving pairs + contingency_table = contingency_table[ + ~( + (contingency_table["pair_to_first_ratio"] < threshold_ratio) + & (contingency_table["pair_to_second_ratio"] < threshold_ratio) + ) + ] + + contingency_table["profiling_run_id"] = profiling_run_id + contingency_table["schema_name"] = table.schema_name + contingency_table["table_name"] = table.table_name + + if df_merged is None: + df_merged = contingency_table + else: + df_merged = pd.concat([df_merged, contingency_table], ignore_index=True) + + save_contingency_rules(df_merged, threshold_ratio) + + +def save_contingency_rules(df: pd.DataFrame, threshold_ratio: float) -> None: + if df is None or df.empty: + return + + contingency_rules = [] + for row in df.itertuples(): + # First causes second: almost all of first coincide with second value + if row.pair_to_first_ratio >= threshold_ratio: + contingency_rules.append( + [ + row.profiling_run_id, + row.schema_name, + row.table_name, + row.first_column_name, + getattr(row, row.first_column_name), + row.second_column_name, + getattr(row, row.second_column_name), + row.pair_count, + row.first_column_overall_count, + row.second_column_overall_count, + row.pair_to_first_ratio, + ] + ) + + # Second causes first: almost all of second coincide with first value + if row.pair_to_second_ratio >= threshold_ratio: + contingency_rules.append( + [ + row.profiling_run_id, + row.schema_name, + row.table_name, + row.second_column_name, + getattr(row, row.second_column_name), + row.first_column_name, + getattr(row, row.first_column_name), + row.pair_count, + row.second_column_overall_count, + row.first_column_overall_count, + row.pair_to_second_ratio, + ] + ) + + write_to_app_db( + contingency_rules, + [ + "profile_run_id", + "schema_name", + "table_name", + "cause_column_name", + "cause_column_value", + "effect_column_name", + "effect_column_value", + "pair_count", + "cause_column_total", + "effect_column_total", + "rule_ratio", + ], + "profile_pair_rules", + ) diff --git a/testgen/commands/run_profiling.py b/testgen/commands/run_profiling.py new file mode 
100644 index 00000000..de217cb9 --- /dev/null +++ b/testgen/commands/run_profiling.py @@ -0,0 +1,323 @@ +import logging +import subprocess +import threading +from datetime import UTC, datetime, timedelta +from uuid import UUID + +import testgen.common.process_service as process_service +from testgen import settings +from testgen.commands.queries.profiling_query import HygieneIssueType, ProfilingSQL, TableSampling +from testgen.commands.queries.refresh_data_chars_query import ColumnChars +from testgen.commands.queries.rollup_scores_query import RollupScoresSQL +from testgen.commands.run_generate_tests import run_test_gen_queries +from testgen.commands.run_refresh_data_chars import run_data_chars_refresh +from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results +from testgen.commands.run_test_execution import run_test_execution_in_background +from testgen.common import ( + execute_db_queries, + fetch_dict_from_db, + fetch_from_db_threaded, + set_target_db_params, + write_to_app_db, +) +from testgen.common.database.database_service import ThreadedProgress, empty_cache +from testgen.common.mixpanel_service import MixpanelService +from testgen.common.models import with_database_session +from testgen.common.models.connection import Connection +from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.models.table_group import TableGroup +from testgen.common.models.test_suite import TestSuite +from testgen.ui.session import session +from testgen.utils import get_exception_message + +LOG = logging.getLogger("testgen") + + +def run_profiling_in_background(table_group_id: str | UUID) -> None: + msg = f"Triggering profiling run for table group {table_group_id}" + if settings.IS_DEBUG: + LOG.info(msg + ". 
Running in debug mode (new thread instead of new process).") + empty_cache() + background_thread = threading.Thread( + target=run_profiling, + args=(table_group_id, session.auth.user_display if session.auth else None), + ) + background_thread.start() + else: + LOG.info(msg) + script = ["testgen", "run-profile", "-tg", str(table_group_id)] + subprocess.Popen(script) # NOQA S603 + + +@with_database_session +def run_profiling(table_group_id: str | UUID, username: str | None = None, run_date: datetime | None = None) -> str: + if table_group_id is None: + raise ValueError("Table Group ID was not specified") + + LOG.info(f"Starting profiling run for table group {table_group_id}") + time_delta = (run_date - datetime.now(UTC)) if run_date else timedelta() + + LOG.info("Retrieving connection and table group parameters") + table_group = TableGroup.get(table_group_id) + connection = Connection.get(table_group.connection_id) + set_target_db_params(connection.__dict__) + + LOG.info("Creating profiling run record") + profiling_run = ProfilingRun( + project_code=table_group.project_code, + connection_id=connection.connection_id, + table_groups_id=table_group.id, + profiling_starttime=datetime.now(UTC) + time_delta, + process_id=process_service.get_current_process_id(), + ) + profiling_run.init_progress() + profiling_run.set_progress("data_chars", "Running") + profiling_run.save() + + LOG.info(f"Profiling run: {profiling_run.id}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}") + try: + data_chars = run_data_chars_refresh(connection, table_group, profiling_run.profiling_starttime) + distinct_tables = {(column.table_name, column.record_ct) for column in data_chars} + + profiling_run.set_progress("data_chars", "Completed") + profiling_run.table_ct = len(distinct_tables) + profiling_run.column_ct = len(data_chars) + profiling_run.record_ct = sum(table[1] for table in distinct_tables) + profiling_run.data_point_ct = sum(column.record_ct for column in data_chars) + + if data_chars: + sql_generator = ProfilingSQL(connection, table_group, profiling_run) + + _run_column_profiling(sql_generator, data_chars) + _run_frequency_analysis(sql_generator) + _run_hygiene_issue_detection(sql_generator) + + # if table_group.profile_do_pair_rules == "Y": + # LOG.info("Compiling pairwise contingency rules") + # run_pairwise_contingency_check(profiling_run.id, table_group.profile_pair_rule_pct) + else: + LOG.info("No columns were selected to profile.") + except Exception as e: + LOG.exception("Profiling encountered an error.") + LOG.info("Setting profiling run status to Error") + profiling_run.log_message = get_exception_message(e) + profiling_run.profiling_endtime = datetime.now(UTC) + time_delta + profiling_run.status = "Error" + profiling_run.save() + else: + LOG.info("Setting profiling run status to Completed") + profiling_run.profiling_endtime = datetime.now(UTC) + time_delta + profiling_run.status = "Complete" + profiling_run.save() + + _rollup_profiling_scores(profiling_run, table_group) + + if bool(table_group.monitor_test_suite_id) and not table_group.last_complete_profile_run_id: + _generate_monitor_tests(table_group_id, table_group.monitor_test_suite_id) + finally: + MixpanelService().send_event( + "run-profiling", + source=settings.ANALYTICS_JOB_SOURCE, + username=username, + sql_flavor=connection.sql_flavor_code, + sampling=table_group.profile_use_sampling, + table_count=profiling_run.table_ct or 0, + column_count=profiling_run.column_ct or 0, + 
run_duration=(profiling_run.profiling_endtime - profiling_run.profiling_starttime).total_seconds(), + scoring_duration=(datetime.now(UTC) + time_delta - profiling_run.profiling_endtime).total_seconds(), + ) + + return f""" + {"Profiling encountered an error. Check log for details." if profiling_run.status == "Error" else "Profiling completed."} + Run ID: {profiling_run.id} + """ + + +def _run_column_profiling(sql_generator: ProfilingSQL, data_chars: list[ColumnChars]) -> None: + profiling_run = sql_generator.profiling_run + profiling_run.set_progress("col_profiling", "Running") + profiling_run.save() + + LOG.info(f"Running column profiling queries: {len(data_chars)}") + table_group = sql_generator.table_group + sampling_params: dict[str, TableSampling] = {} + sample_percent = ( + float(table_group.profile_sample_percent) + if str(table_group.profile_sample_percent).replace(".", "", 1).isdigit() + else 30 + ) + if table_group.profile_use_sampling and 0 < sample_percent < 100: + min_sample = table_group.profile_sample_min_count + max_sample = 999000 + for column in data_chars: + if not sampling_params.get(column.table_name) and column.record_ct > min_sample: + calc_sample = round(sample_percent * column.record_ct / 100) + sample_count = min(max(calc_sample, min_sample), max_sample) + + sampling_params[column.table_name] = TableSampling( + table_name=column.table_name, + sample_count=sample_count, + sample_ratio=column.record_ct / sample_count, + sample_percent=round(100 * sample_count / column.record_ct, 4), + ) + + def update_column_progress(progress: ThreadedProgress) -> None: + profiling_run.set_progress( + "col_profiling", + "Running", + detail=f"{progress['processed']} of {progress['total']}", + error=f"{progress['errors']} column{'s' if progress['errors'] > 1 else ''} had errors" + if progress["errors"] + else None, + ) + profiling_run.save() + + profiling_results, result_columns, error_data = fetch_from_db_threaded( + [sql_generator.run_column_profiling(column, sampling_params.get(column.table_name)) for column in data_chars], + use_target_db=True, + max_threads=sql_generator.connection.max_threads, + progress_callback=update_column_progress, + ) + + if error_count := len(error_data): + LOG.warning(f"Errors running column profiling queries: {error_count}") + LOG.info("Writing column profiling errors") + error_results = sql_generator.get_profiling_errors( + [(data_chars[index], error) for index, error in error_data.items()] + ) + write_to_app_db(error_results, sql_generator.error_columns, sql_generator.profiling_results_table) + + if not profiling_results: # All queries failed, so stop the process + raise RuntimeError(f"{error_count} errors during column profiling. See details in results.") + + LOG.info("Writing column profiling results") + write_to_app_db(profiling_results, result_columns, sql_generator.profiling_results_table) + + if sampling_params: + try: + LOG.info("Updating sampled profiling results") + execute_db_queries( + [ + sql_generator.update_sampled_profiling_results(table_sampling) + for table_sampling in sampling_params.values() + ] + ) + except Exception as e: + raise RuntimeError(f"Error updating sampled profiling results. {get_exception_message(e)}") from e + + profiling_run.set_progress( + "col_profiling", + "Warning" if error_count else "Completed", + error=f"{error_count} column{'s' if error_count > 1 else ''} had errors. See details in results." 
+ if error_count + else None, + ) + + +def _run_frequency_analysis(sql_generator: ProfilingSQL) -> None: + profiling_run = sql_generator.profiling_run + profiling_run.set_progress("freq_analysis", "Running") + profiling_run.save() + + error_data = None + try: + LOG.info("Selecting columns for frequency analysis") + frequency_columns = fetch_dict_from_db(*sql_generator.get_frequency_analysis_columns()) + + if frequency_columns: + LOG.info(f"Running frequency analysis queries: {len(frequency_columns)}") + + def update_frequency_progress(progress: ThreadedProgress) -> None: + profiling_run.set_progress( + "freq_analysis", "Running", detail=f"{progress['processed']} of {progress['total']}" + ) + profiling_run.save() + + frequency_results, result_columns, error_data = fetch_from_db_threaded( + [sql_generator.run_frequency_analysis(ColumnChars(**column)) for column in frequency_columns], + use_target_db=True, + max_threads=sql_generator.connection.max_threads, + progress_callback=update_frequency_progress, + ) + if error_data: + LOG.warning(f"Errors running frequency analysis queries: {len(error_data)}") + + if frequency_results: + LOG.info("Writing frequency results to staging") + write_to_app_db(frequency_results, result_columns, sql_generator.frequency_staging_table) + + LOG.info("Updating profiling results with frequency analysis and deleting staging") + execute_db_queries(sql_generator.update_frequency_analysis_results()) + except Exception as e: + profiling_run.set_progress("freq_analysis", "Warning", error=f"Error encountered. {get_exception_message(e)}") + else: + if error_data: + profiling_run.set_progress( + "freq_analysis", "Warning", error=f"Error encountered. {next(iter(error_data.values()))}" + ) + else: + profiling_run.set_progress("freq_analysis", "Completed") + + +def _run_hygiene_issue_detection(sql_generator: ProfilingSQL) -> None: + profiling_run = sql_generator.profiling_run + profiling_run.set_progress("hygiene_issues", "Running") + profiling_run.save() + + try: + LOG.info("Detecting functional data types and critical data elements") + execute_db_queries(sql_generator.update_profiling_results()) + + LOG.info("Retrieving hygiene issue types") + hygiene_issue_types = fetch_dict_from_db(*sql_generator.get_hygiene_issue_types()) + hygiene_issue_types = [HygieneIssueType(**item) for item in hygiene_issue_types] + + LOG.info("Detecting hygiene issues and updating prevalence and counts") + execute_db_queries( + [ + *[ + query + for issue_type in hygiene_issue_types + if (query := sql_generator.detect_hygiene_issue(issue_type)) + ], + *[ + sql_generator.update_hygiene_issue_prevalence(issue_type) + for issue_type in hygiene_issue_types + if issue_type.dq_score_prevalence_formula + ], + sql_generator.update_hygiene_issue_counts(), + ] + ) + except Exception as e: + profiling_run.set_progress("hygiene_issues", "Warning", error=f"Error encountered. 
{get_exception_message(e)}") + else: + profiling_run.set_progress("hygiene_issues", "Completed") + + +def _rollup_profiling_scores(profiling_run: ProfilingRun, table_group: TableGroup) -> None: + try: + LOG.info("Rolling up profiling scores") + execute_db_queries( + RollupScoresSQL(profiling_run.id, table_group.id).rollup_profiling_scores(), + ) + run_refresh_score_cards_results( + project_code=table_group.project_code, + add_history_entry=True, + refresh_date=profiling_run.profiling_starttime, + ) + except Exception: + LOG.exception("Error rolling up profiling scores") + + +@with_database_session +def _generate_monitor_tests(table_group_id: str, test_suite_id: str) -> None: + try: + monitor_test_suite = TestSuite.get(test_suite_id) + if not monitor_test_suite: + LOG.info("Skipping test generation on missing monitor test suite") + else: + LOG.info("Generating monitor tests") + run_test_gen_queries(table_group_id, monitor_test_suite.test_suite, "Monitor") + run_test_execution_in_background(test_suite_id) + except Exception: + LOG.exception("Error generating monitor tests") diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py deleted file mode 100644 index 236985bc..00000000 --- a/testgen/commands/run_profiling_bridge.py +++ /dev/null @@ -1,499 +0,0 @@ -import logging -import subprocess -import threading -import uuid -from datetime import UTC, datetime - -import pandas as pd -from progress.spinner import Spinner - -import testgen.common.process_service as process_service -from testgen import settings -from testgen.commands.queries.profiling_query import CProfilingSQL -from testgen.commands.run_execute_tests import run_execution_steps_in_background -from testgen.commands.run_generate_tests import run_test_gen_queries -from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results -from testgen.common import ( - date_service, - execute_db_queries, - fetch_dict_from_db, - fetch_from_db_threaded, - get_profiling_params, - quote_csv_items, - set_target_db_params, - write_to_app_db, -) -from testgen.common.database.database_service import empty_cache, get_flavor_service -from testgen.common.mixpanel_service import MixpanelService -from testgen.common.models import with_database_session -from testgen.common.models.connection import Connection -from testgen.common.models.test_suite import TestSuite -from testgen.ui.session import session - -LOG = logging.getLogger("testgen") - - -def save_contingency_rules(df_merged, threshold_ratio): - # Prep rows to save - lst_rules = [] - for row in df_merged.itertuples(): - # First causes second: almost all of first coincide with second value - if row.pair_to_first_ratio >= threshold_ratio: - profiling_run_id = row.profiling_run_id - schema_name = row.schema_name - table_name = row.table_name - cause_column_name = row.first_column_name - cause_column_value = getattr(row, row.first_column_name) - effect_column_name = row.second_column_name - effect_column_value = getattr(row, row.second_column_name) - pair_count = row.pair_count - cause_column_total = row.first_column_overall_count - effect_column_total = row.second_column_overall_count - rule_ratio = row.pair_to_first_ratio - lst_rules.append( - [ - profiling_run_id, - schema_name, - table_name, - cause_column_name, - cause_column_value, - effect_column_name, - effect_column_value, - pair_count, - cause_column_total, - effect_column_total, - rule_ratio, - ] - ) - - # Second causes first: almost all of second coincide with first value - if 
row.pair_to_second_ratio >= threshold_ratio: - profiling_run_id = row.profiling_run_id - schema_name = row.schema_name - table_name = row.table_name - cause_column_name = row.second_column_name - cause_column_value = getattr(row, row.second_column_name) - effect_column_name = row.first_column_name - effect_column_value = getattr(row, row.first_column_name) - pair_count = row.pair_count - cause_column_total = row.second_column_overall_count - effect_column_total = row.first_column_overall_count - rule_ratio = row.pair_to_second_ratio - lst_rules.append( - [ - profiling_run_id, - schema_name, - table_name, - cause_column_name, - cause_column_value, - effect_column_name, - effect_column_value, - pair_count, - cause_column_total, - effect_column_total, - rule_ratio, - ] - ) - - write_to_app_db( - lst_rules, - [ - "profile_run_id", - "schema_name", - "table_name", - "cause_column_name", - "cause_column_value", - "effect_column_name", - "effect_column_value", - "pair_count", - "cause_column_total", - "effect_column_total", - "rule_ratio", - ], - "profile_pair_rules", - ) - - -def RunPairwiseContingencyCheck(clsProfiling: CProfilingSQL, threshold_ratio: float): - # Goal: identify pairs of values that represent IF X=A THEN Y=B rules - - # Define the threshold percent -- should be high - if threshold_ratio: - threshold_ratio = threshold_ratio / 100.0 - else: - threshold_ratio = 0.95 - str_max_values = "6" - - # Retrieve columns to include in list from profiing results - clsProfiling.contingency_max_values = str_max_values - lst_tables = fetch_dict_from_db(*clsProfiling.GetContingencyColumns()) - - # Retrieve record counts per column combination - df_merged = None - if lst_tables: - for dct_table in lst_tables: - df_merged = None - clsProfiling.data_schema = dct_table["schema_name"] - clsProfiling.data_table = dct_table["table_name"] - clsProfiling.contingency_columns = quote_csv_items(dct_table["contingency_columns"]) - lst_counts = fetch_dict_from_db(*clsProfiling.GetContingencyCounts(), use_target_db=True) - if lst_counts: - df = pd.DataFrame(lst_counts) - # Get list of columns - columns = dct_table["contingency_columns"].lower().split(",") - - # Calculate overall counts for each column - overall_counts = {col: df.groupby(col)["freq_ct"].sum() for col in columns} - - # Prepare to aggregate the data - contingency_table = [] - for i, col1 in enumerate(columns): - for col2 in columns[i + 1 :]: - # Create a pivot table for each pair - pivot = df.pivot_table(index=col1, columns=col2, values="freq_ct", aggfunc="sum", fill_value=0) - pivot = pivot.stack().reset_index() - pivot.rename(columns={0: "pair_count"}, inplace=True) - - # Add overall counts - pivot["first_column_overall_count"] = pivot[col1].map(overall_counts[col1]) - pivot["second_column_overall_count"] = pivot[col2].map(overall_counts[col2]) - - # Add column names - pivot["first_column_name"] = col1 - pivot["second_column_name"] = col2 - - contingency_table.append(pivot) - - # Combine all pairs into a single DataFrame - contingency_table = pd.concat(contingency_table, ignore_index=True) - - # Calculate the ratios - contingency_table["pair_to_first_ratio"] = ( - contingency_table["pair_count"] / contingency_table["first_column_overall_count"] - ) - contingency_table["pair_to_second_ratio"] = ( - contingency_table["pair_count"] / contingency_table["second_column_overall_count"] - ) - - # Include rows where both cols meet minimum threshold count (max of 30 or 5%) - total_observations = contingency_table["pair_count"].sum() - threshold_min = 
max(total_observations * 0.05, 30) - contingency_table = contingency_table[ - (contingency_table["first_column_overall_count"] >= threshold_min) - & (contingency_table["second_column_overall_count"] >= threshold_min) - ] - # Drop rows where neither ratio meets the threshold ratio (keep if either meets it) - # -- note we still have to check individual columns when saving pairs - contingency_table = contingency_table[ - ~( - (contingency_table["pair_to_first_ratio"] < threshold_ratio) - & (contingency_table["pair_to_second_ratio"] < threshold_ratio) - ) - ] - - # Add table name - contingency_table["profiling_run_id"] = clsProfiling.profile_run_id - contingency_table["schema_name"] = dct_table["schema_name"] - contingency_table["table_name"] = dct_table["table_name"] - - # Combine with previous tables - if df_merged == None: - df_merged = contingency_table - else: - df_merged = pd.concat([df_merged, contingency_table], ignore_index=True) - - if df_merged is not None: - if not df_merged.empty: - save_contingency_rules(df_merged, threshold_ratio) - - -def run_profiling_in_background(table_group_id): - msg = f"Starting run_profiling_in_background against table group_id: {table_group_id}" - if settings.IS_DEBUG: - LOG.info(msg + ". Running in debug mode (new thread instead of new process).") - empty_cache() - background_thread = threading.Thread( - target=run_profiling_queries, - args=(table_group_id, session.auth.user_display if session.auth else None), - ) - background_thread.start() - else: - LOG.info(msg) - script = ["testgen", "run-profile", "-tg", str(table_group_id)] - subprocess.Popen(script) # NOQA S603 - - -@with_database_session -def run_profiling_queries(table_group_id: str, username: str | None = None, spinner: Spinner | None = None, minutes_offset: int = 0): - if table_group_id is None: - raise ValueError("Table Group ID was not specified") - - has_errors = False - - # Set Project Connection Parms in common.db_bridgers from retrieved parms - LOG.info("CurrentStep: Assigning Connection Parameters") - connection = Connection.get_by_table_group(table_group_id) - set_target_db_params(connection.__dict__) - - LOG.info("CurrentStep: Retrieving Parameters") - - # Generate UUID for Profile Run ID - profiling_run_id = str(uuid.uuid4()) - - params = get_profiling_params(table_group_id) - needs_monitor_tests_generated = ( - bool(params["monitor_test_suite_id"]) and not params["last_complete_profile_run_id"] - ) - - LOG.info("CurrentStep: Initializing Query Generator") - clsProfiling = CProfilingSQL(params["project_code"], connection.sql_flavor, minutes_offset=minutes_offset) - - # Set General Parms - clsProfiling.table_groups_id = table_group_id - clsProfiling.connection_id = connection.connection_id - clsProfiling.profile_run_id = profiling_run_id - clsProfiling.data_schema = params["table_group_schema"] - clsProfiling.parm_table_set = params["profiling_table_set"] - clsProfiling.parm_table_include_mask = params["profiling_include_mask"] - clsProfiling.parm_table_exclude_mask = params["profiling_exclude_mask"] - clsProfiling.profile_id_column_mask = params["profile_id_column_mask"] - clsProfiling.profile_sk_column_mask = params["profile_sk_column_mask"] - clsProfiling.profile_use_sampling = params["profile_use_sampling"] - clsProfiling.profile_flag_cdes = params["profile_flag_cdes"] - clsProfiling.profile_sample_percent = params["profile_sample_percent"] - clsProfiling.profile_sample_min_count = params["profile_sample_min_count"] - clsProfiling.process_id = 
process_service.get_current_process_id() - - # Add a record in profiling_runs table for the new profile - execute_db_queries([clsProfiling.GetProfileRunInfoRecordsQuery()]) - if spinner: - spinner.next() - - table_count = 0 - column_count = 0 - try: - # Retrieve Column Metadata - LOG.info("CurrentStep: Getting DDF from project") - - lstResult = fetch_dict_from_db(*clsProfiling.GetDDFQuery(), use_target_db=True) - column_count = len(lstResult) - - if lstResult: - flavor_service = get_flavor_service(connection.sql_flavor) - quote = flavor_service.quote_character - - # Get distinct tables - distinct_tables = set() - for item in lstResult: - schema_name = item["table_schema"] - table_name = item["table_name"] - distinct_tables.add(f"{quote}{schema_name}{quote}.{quote}{table_name}{quote}") - - # Convert the set to a list - distinct_tables_list = list(distinct_tables) - table_count = len(distinct_tables_list) - - if clsProfiling.profile_use_sampling == "Y": - # Sampling tables - lstQueries = [] - for parm_sampling_table in distinct_tables_list: - clsProfiling.sampling_table = parm_sampling_table - lstQueries.append(clsProfiling.GetTableSampleCount()) - - lstSampleTables, _, intErrors = fetch_from_db_threaded( - lstQueries, use_target_db=True, max_threads=connection.max_threads, spinner=spinner - ) - dctSampleTables = {x[0]: [x[1], x[2], x[3]] for x in lstSampleTables} - if intErrors > 0: - has_errors = True - LOG.warning( - f"Errors were encountered retrieving sampling table counts. ({intErrors} errors occurred.) Please check log." - ) - - # Assemble profiling queries - if spinner: - spinner.next() - LOG.info("CurrentStep: Assembling profiling queries, round 1") - lstQueries = [] - for dctColumnRecord in lstResult: - # Set Column Parms - clsProfiling.data_schema = dctColumnRecord["table_schema"] - clsProfiling.data_table = dctColumnRecord["table_name"] - clsProfiling.col_name = dctColumnRecord["column_name"] - clsProfiling.col_type = dctColumnRecord["column_type"] - clsProfiling.db_data_type = dctColumnRecord["db_data_type"] - clsProfiling.profile_run_id = profiling_run_id - clsProfiling.col_is_decimal = dctColumnRecord["is_decimal"] - clsProfiling.col_ordinal_position = dctColumnRecord["ordinal_position"] - clsProfiling.col_gen_type = dctColumnRecord["general_type"] - clsProfiling.parm_do_sample = "N" - - if clsProfiling.profile_use_sampling == "Y": - table_identifier = f"{quote}{clsProfiling.data_schema}{quote}.{quote}{clsProfiling.data_table}{quote}" - if dctSampleTables[table_identifier][0] > -1: - clsProfiling.parm_sample_size = dctSampleTables[table_identifier][0] - clsProfiling.sample_ratio = dctSampleTables[table_identifier][1] - clsProfiling.sample_percent_calc = dctSampleTables[table_identifier][2] - clsProfiling.parm_do_sample = clsProfiling.profile_use_sampling - else: - clsProfiling.parm_sample_size = 0 - clsProfiling.sample_ratio = "" - clsProfiling.sample_percent_calc = "" - - lstQueries.append(clsProfiling.GetProfilingQuery()) - - # Run Profiling Queries and save results - LOG.info("CurrentStep: Profiling Round 1") - LOG.debug("Running %s profiling queries", len(lstQueries)) - - lstProfiles, colProfileNames, intErrors = fetch_from_db_threaded( - lstQueries, use_target_db=True, max_threads=connection.max_threads, spinner=spinner - ) - if intErrors > 0: - has_errors = True - LOG.warning( - f"Errors were encountered executing profiling queries. ({intErrors} errors occurred.) Please check log." 
- ) - LOG.info("CurrentStep: Saving Round 1 profiling results to Metadata") - write_to_app_db(lstProfiles, colProfileNames, "profile_results") - - if clsProfiling.profile_use_sampling == "Y": - lstQueries = [] - for table_name, value in dctSampleTables.items(): - if value[0] > -1: - clsProfiling.sampling_table = table_name - clsProfiling.sample_ratio = value[1] - lstQueries.append(clsProfiling.UpdateProfileResultsToEst()) - - execute_db_queries(lstQueries) - - if clsProfiling.parm_do_freqs == "Y": - lstUpdates = [] - # Get secondary profiling columns - LOG.info("CurrentStep: Selecting columns for frequency analysis") - lstResult = fetch_dict_from_db(*clsProfiling.GetSecondProfilingColumnsQuery()) - - if lstResult: - # Assemble secondary profiling queries - # - Freqs for columns not already freq'd, but with max actual value length under threshold - LOG.info("CurrentStep: Generating frequency queries") - lstQueries = [] - for dctColumnRecord in lstResult: - clsProfiling.data_schema = dctColumnRecord["schema_name"] - clsProfiling.data_table = dctColumnRecord["table_name"] - clsProfiling.col_name = dctColumnRecord["column_name"] - - lstQueries.append(clsProfiling.GetSecondProfilingQuery()) - # Run secondary profiling queries - LOG.info("CurrentStep: Retrieving %s frequency results from project", len(lstQueries)) - lstUpdates, colProfileNames, intErrors = fetch_from_db_threaded( - lstQueries, use_target_db=True, max_threads=connection.max_threads, spinner=spinner - ) - if intErrors > 0: - has_errors = True - LOG.warning( - f"Errors were encountered executing frequency queries. ({intErrors} errors occurred.) Please check log." - ) - - if lstUpdates: - # Copy secondary results to DQ staging - LOG.info("CurrentStep: Writing frequency results to Staging") - write_to_app_db(lstUpdates, colProfileNames, "stg_secondary_profile_updates") - - LOG.info("CurrentStep: Generating profiling update queries") - - lstQueries = [] - lstAnomalyTypes = [] - - if lstUpdates: - # Run single update query, then delete from staging - lstQueries.extend([ - clsProfiling.GetSecondProfilingUpdateQuery(), - clsProfiling.GetSecondProfilingStageDeleteQuery(), - ]) - lstQueries.extend([ - clsProfiling.GetDataTypeSuggestionUpdateQuery(), - clsProfiling.GetFunctionalDataTypeUpdateQuery(), - clsProfiling.GetFunctionalTableTypeStageQuery(), - clsProfiling.GetFunctionalTableTypeUpdateQuery(), - clsProfiling.GetPIIFlagUpdateQuery(), - ]) - - lstAnomalyTypes = fetch_dict_from_db(*clsProfiling.GetAnomalyTestTypesQuery()) - lstQueries.extend([ - query for test_type in lstAnomalyTypes if (query := clsProfiling.GetAnomalyTestQuery(test_type)) - ]) - lstQueries.extend([ - clsProfiling.GetAnomalyScoringQuery(test_type) - for test_type in lstAnomalyTypes - if test_type["dq_score_prevalence_formula"] - ]) - lstQueries.append(clsProfiling.GetAnomalyStatsRefreshQuery()) - - # Always runs last - lstQueries.append(clsProfiling.GetDataCharsRefreshQuery()) - if clsProfiling.profile_flag_cdes: - lstQueries.append(clsProfiling.GetCDEFlaggerQuery()) - - LOG.info("CurrentStep: Running profiling update queries") - execute_db_queries(lstQueries) - - if params["profile_do_pair_rules"] == "Y": - LOG.info("CurrentStep: Compiling pairwise contingency rules") - RunPairwiseContingencyCheck(clsProfiling, params["profile_pair_rule_pct"]) - else: - LOG.info("No columns were selected to profile.") - except Exception as e: - has_errors = True - sqlsplit = e.args[0].split("[SQL", 1) - errorline = sqlsplit[0].replace("'", "''") if len(sqlsplit) > 0 else "unknown 
error" - clsProfiling.exception_message = f"{type(e).__name__}: {errorline}" - raise - finally: - LOG.info("Updating the profiling run record") - execute_db_queries([clsProfiling.GetProfileRunInfoRecordUpdateQuery()]) - end_time = datetime.now(UTC) - - execute_db_queries([ - clsProfiling.GetAnomalyScoringRollupRunQuery(), - clsProfiling.GetAnomalyScoringRollupTableGroupQuery(), - ]) - run_refresh_score_cards_results( - project_code=params["project_code"], - add_history_entry=True, - refresh_date=date_service.parse_now(clsProfiling.run_date), - ) - - MixpanelService().send_event( - "run-profiling", - source=settings.ANALYTICS_JOB_SOURCE, - username=username, - sql_flavor=clsProfiling.flavor, - sampling=clsProfiling.profile_use_sampling == "Y", - table_count=table_count, - column_count=column_count, - run_duration=(end_time - date_service.parse_now(clsProfiling.run_date)).total_seconds(), - scoring_duration=(datetime.now(UTC) - end_time).total_seconds(), - ) - - if needs_monitor_tests_generated: - _generate_monitor_tests(params["project_code"], table_group_id, params["monitor_test_suite_id"]) - - return f""" - Profiling completed {"with errors. Check log for details." if has_errors else "successfully."} - Run ID: {profiling_run_id} - """ - - -@with_database_session -def _generate_monitor_tests(project_code: str, table_group_id: str, test_suite_id: str) -> None: - try: - monitor_test_suite = TestSuite.get(test_suite_id) - if not monitor_test_suite: - LOG.info("Skipping test generation on missing monitor test suite") - else: - LOG.info("Generating monitor tests") - run_test_gen_queries(table_group_id, monitor_test_suite.test_suite, "Monitor") - run_execution_steps_in_background(project_code, monitor_test_suite.test_suite) - except Exception: - LOG.exception("Error generating monitor tests") diff --git a/testgen/commands/run_quick_start.py b/testgen/commands/run_quick_start.py index fd973d95..5c9ea325 100644 --- a/testgen/commands/run_quick_start.py +++ b/testgen/commands/run_quick_start.py @@ -12,6 +12,9 @@ set_target_db_params, ) from testgen.common.database.flavor.flavor_service import ConnectionParams +from testgen.common.models import with_database_session +from testgen.common.models.scores import ScoreDefinition +from testgen.common.models.table_group import TableGroup from testgen.common.read_file import read_template_sql_file LOG = logging.getLogger("testgen") @@ -135,6 +138,14 @@ def run_quick_start(delete_target_db: bool) -> None: use_target_db=True, ) + score_definition = ScoreDefinition.from_table_group( + TableGroup( + project_code=settings.PROJECT_KEY, + table_groups_name=settings.DEFAULT_TABLE_GROUPS_NAME, + ) + ) + with_database_session(score_definition.save)() + def run_quick_start_increment(iteration): params_mapping = _get_params_mapping(iteration) diff --git a/testgen/commands/run_refresh_data_chars.py b/testgen/commands/run_refresh_data_chars.py index 2c812559..a972f7f1 100644 --- a/testgen/commands/run_refresh_data_chars.py +++ b/testgen/commands/run_refresh_data_chars.py @@ -1,83 +1,59 @@ import logging +from datetime import datetime -from progress.spinner import Spinner - -from testgen.commands.queries.refresh_data_chars_query import CRefreshDataCharsSQL +from testgen.commands.queries.refresh_data_chars_query import ColumnChars, RefreshDataCharsSQL from testgen.common.database.database_service import ( execute_db_queries, fetch_dict_from_db, fetch_from_db_threaded, - get_flavor_service, write_to_app_db, ) -from testgen.common.get_pipeline_parms import TestExecutionParams 
+from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup +from testgen.utils import get_exception_message LOG = logging.getLogger("testgen") -STAGING_TABLE = "stg_data_chars_updates" - -def run_refresh_data_chars_queries(params: TestExecutionParams, run_date: str, spinner: Spinner=None): - LOG.info("CurrentStep: Initializing Data Characteristics Refresh") - sql_generator = CRefreshDataCharsSQL(params, run_date, STAGING_TABLE) - flavor_service = get_flavor_service(params["sql_flavor"]) - quote = flavor_service.quote_character - LOG.info("CurrentStep: Getting DDF for table group") - ddf_results = fetch_dict_from_db(*sql_generator.GetDDFQuery(), use_target_db=True) - - distinct_tables = { - f"{quote}{item['table_schema']}{quote}.{quote}{item['table_name']}{quote}" - for item in ddf_results - } - if distinct_tables: - count_queries = sql_generator.GetRecordCountQueries(distinct_tables) +def run_data_chars_refresh(connection: Connection, table_group: TableGroup, run_date: datetime) -> list[ColumnChars]: + sql_generator = RefreshDataCharsSQL(connection, table_group) + + LOG.info("Getting DDF for table group") + try: + data_chars = fetch_dict_from_db(*sql_generator.get_schema_ddf(), use_target_db=True) + except Exception as e: + raise RuntimeError(f"Error refreshing columns for data catalog. {get_exception_message(e)}") from e + + data_chars = [ColumnChars(**column) for column in data_chars] + if data_chars: + distinct_tables = {column.table_name for column in data_chars} + LOG.info(f"Tables: {len(distinct_tables)}, Columns: {len(data_chars)}") + count_queries = sql_generator.get_row_counts(distinct_tables) - LOG.info("CurrentStep: Getting record counts for table group") - count_results, _, error_count = fetch_from_db_threaded( - count_queries, use_target_db=True, max_threads=params["max_threads"], spinner=spinner + LOG.info("Getting row counts for table group") + count_results, _, error_data = fetch_from_db_threaded( + count_queries, use_target_db=True, max_threads=connection.max_threads, ) - if error_count: - LOG.warning(f"{error_count} errors were encountered while retrieving record counts.") + + count_map = dict(count_results) + for column in data_chars: + column.record_ct = count_map.get(column.table_name) + + write_data_chars(data_chars, sql_generator, run_date) + + if error_data: + raise RuntimeError(f"Error refreshing row counts for data catalog. {next(iter(error_data.values()))}") else: - count_results = [] - LOG.warning("No tables detected in table group. 
Skipping retrieval of record counts") + LOG.warning("No tables detected in table group") + + return data_chars + - count_map = dict(count_results) - staging_columns = [ - "project_code", - "table_groups_id", - "run_date", - "schema_name", - "table_name", - "column_name", - "position", - "general_type", - "column_type", - "db_data_type", - "record_ct", - ] - staging_records = [ - [ - item["project_code"], - params["table_groups_id"], - run_date, - item["table_schema"], - item["table_name"], - item["column_name"], - item["ordinal_position"], - item["general_type"], - item["column_type"], - item["db_data_type"], - count_map.get(f"{quote}{item['table_schema']}{quote}.{quote}{item['table_name']}{quote}", 0), - ] - for item in ddf_results - ] +def write_data_chars(data_chars: list[ColumnChars], sql_generator: RefreshDataCharsSQL, run_date: datetime) -> None: + staging_results = sql_generator.get_staging_data_chars(data_chars, run_date) - LOG.info("CurrentStep: Writing data characteristics to staging") - write_to_app_db(staging_records, staging_columns, STAGING_TABLE) + LOG.info("Writing data characteristics to staging") + write_to_app_db(staging_results, sql_generator.staging_columns, sql_generator.staging_table) - LOG.info("CurrentStep: Refreshing data characteristics and deleting staging") - execute_db_queries([ - sql_generator.GetDataCharsUpdateQuery(), - sql_generator.GetStagingDeleteQuery(), - ]) + LOG.info("Refreshing data characteristics and deleting staging") + execute_db_queries(sql_generator.update_data_chars(run_date)) diff --git a/testgen/commands/run_refresh_score_cards_results.py b/testgen/commands/run_refresh_score_cards_results.py index 7d56c6b5..7f0015f8 100644 --- a/testgen/commands/run_refresh_score_cards_results.py +++ b/testgen/commands/run_refresh_score_cards_results.py @@ -24,7 +24,6 @@ def run_refresh_score_cards_results( ): start_time = time.time() _refresh_date = refresh_date or datetime.datetime.now(datetime.UTC) - LOG.info("CurrentStep: Initializing scorecards results refresh") try: definitions = [] @@ -33,32 +32,31 @@ def run_refresh_score_cards_results( else: definitions.append(ScoreDefinition.get(str(definition_id))) except Exception: - LOG.exception("CurrentStep: Stopping scorecards results refresh after unexpected error") + LOG.exception("Stopping scorecards results refresh after unexpected error") return db_session = get_current_session() + for definition in definitions: LOG.info( - "CurrentStep: Refreshing results for scorecard %s in project %s", + "Refreshing results for scorecard %s in project %s", definition.name, definition.project_code, ) try: fresh_score_card = definition.as_score_card() - definition.results = [] - definition.breakdown = [] - db_session.flush([definition]) - + definition.clear_results() definition.results = _score_card_to_results(fresh_score_card) definition.breakdown = _score_definition_to_results_breakdown(definition) if add_history_entry: - LOG.info( - "CurrentStep: Adding history entry for scorecard %s in project %s", + LOG.debug( + "Adding history entry for scorecard %s in project %s", definition.name, definition.project_code, ) + last_added_entry = None historical_categories = ["score", "cde_score"] for result in definition.results: if result.category in historical_categories: @@ -66,19 +64,18 @@ def run_refresh_score_cards_results( definition_id=result.definition_id, category=result.category, score=result.score, - last_run_time=_refresh_date, + last_run_time=_refresh_date.replace(tzinfo=None), ) - 
definition.history.append(history_entry) - history_entry.add_as_cutoff() + db_session.add(history_entry) + db_session.flush([history_entry]) + last_added_entry = history_entry + + if last_added_entry: + last_added_entry.add_as_cutoff() definition.save() - LOG.info( - "CurrentStep: Done refreshing scorecard %s in project %s", - definition.name, - definition.project_code, - ) except Exception: LOG.exception( - "CurrentStep: Unexpected error refreshing scorecard %s in project %s", + "Error refreshing scorecard %s in project %s", definition.name, definition.project_code, ) @@ -90,7 +87,7 @@ def run_refresh_score_cards_results( scope = f"scorecard {definition_id}" end_time = time.time() - LOG.info("CurrentStep: Refreshing results for %s is over after %s seconds", scope, round(end_time - start_time, 2)) + LOG.info("Refreshing results for %s done after %s seconds", scope, round(end_time - start_time, 2)) def _score_card_to_results(score_card: ScoreCard) -> list[ScoreDefinitionResult]: diff --git a/testgen/commands/run_rollup_scores.py b/testgen/commands/run_rollup_scores.py index e835571e..1676504f 100644 --- a/testgen/commands/run_rollup_scores.py +++ b/testgen/commands/run_rollup_scores.py @@ -1,6 +1,6 @@ import logging -from testgen.commands.queries.rollup_scores_query import CRollupScoresSQL +from testgen.commands.queries.rollup_scores_query import RollupScoresSQL from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results from testgen.common.database.database_service import execute_db_queries @@ -8,26 +8,14 @@ def run_profile_rollup_scoring_queries(project_code: str, run_id: str, table_group_id: str | None = None): - LOG.info("CurrentStep: Initializing Profiling Scores Rollup") - sql_generator = CRollupScoresSQL(run_id, table_group_id) - - queries = [sql_generator.GetRollupScoresProfileRunQuery()] - if table_group_id: - queries.append(sql_generator.GetRollupScoresProfileTableGroupQuery()) - - LOG.info("CurrentStep: Rolling up profiling scores") - execute_db_queries(queries) + sql_generator = RollupScoresSQL(run_id, table_group_id) + execute_db_queries(sql_generator.rollup_profiling_scores()) run_refresh_score_cards_results(project_code=project_code) def run_test_rollup_scoring_queries(project_code: str, run_id: str, table_group_id: str | None = None): - LOG.info("CurrentStep: Initializing Testing Scores Rollup") - sql_generator = CRollupScoresSQL(run_id, table_group_id) - - queries = [sql_generator.GetRollupScoresTestRunQuery()] - if table_group_id: - queries.append(sql_generator.GetRollupScoresTestTableGroupQuery()) - - LOG.info("CurrentStep: Rolling up testing scores") - execute_db_queries(queries) + sql_generator = RollupScoresSQL(run_id, table_group_id) + execute_db_queries( + sql_generator.rollup_test_scores(update_table_group=table_group_id is not None) + ) run_refresh_score_cards_results(project_code=project_code) diff --git a/testgen/commands/run_test_execution.py b/testgen/commands/run_test_execution.py new file mode 100644 index 00000000..bb91f70c --- /dev/null +++ b/testgen/commands/run_test_execution.py @@ -0,0 +1,322 @@ +import logging +import subprocess +import threading +from datetime import UTC, datetime, timedelta +from functools import partial +from typing import Literal +from uuid import UUID + +import testgen.common.process_service as process_service +from testgen import settings +from testgen.commands.queries.execute_tests_query import TestExecutionDef, TestExecutionSQL +from testgen.commands.queries.rollup_scores_query import 
RollupScoresSQL +from testgen.commands.run_refresh_score_cards_results import run_refresh_score_cards_results +from testgen.common import ( + execute_db_queries, + fetch_dict_from_db, + fetch_from_db_threaded, + set_target_db_params, + write_to_app_db, +) +from testgen.common.database.database_service import ThreadedProgress, empty_cache +from testgen.common.mixpanel_service import MixpanelService +from testgen.common.models import with_database_session +from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup +from testgen.common.models.test_run import TestRun +from testgen.common.models.test_suite import TestSuite +from testgen.ui.session import session +from testgen.utils import get_exception_message + +from .run_refresh_data_chars import run_data_chars_refresh +from .run_test_validation import run_test_validation + +LOG = logging.getLogger("testgen") + + +def run_test_execution_in_background(test_suite_id: str | UUID): + msg = f"Triggering test run for test suite {test_suite_id}" + if settings.IS_DEBUG: + LOG.info(msg + ". Running in debug mode (new thread instead of new process).") + empty_cache() + background_thread = threading.Thread( + target=run_test_execution, + args=(test_suite_id, session.auth.user_display if session.auth else None), + ) + background_thread.start() + else: + LOG.info(msg) + script = ["testgen", "run-tests", "--test-suite-id", str(test_suite_id)] + subprocess.Popen(script) # NOQA S603 + + +@with_database_session +def run_test_execution(test_suite_id: str | UUID, username: str | None = None, run_date: datetime | None = None) -> str: + if test_suite_id is None: + raise ValueError("Test Suite ID was not specified") + + LOG.info(f"Starting test run for test suite {test_suite_id}") + time_delta = (run_date - datetime.now(UTC)) if run_date else timedelta() + + LOG.info("Retrieving connection, table group, and test suite parameters") + test_suite = TestSuite.get(test_suite_id) + table_group = TableGroup.get(test_suite.table_groups_id) + connection = Connection.get(table_group.connection_id) + set_target_db_params(connection.__dict__) + + LOG.info("Creating test run record") + test_run = TestRun( + test_suite_id=test_suite_id, + test_starttime=datetime.now(UTC) + time_delta, + process_id=process_service.get_current_process_id(), + ) + test_run.init_progress() + test_run.set_progress("data_chars", "Running") + test_run.save() + + try: + LOG.info(f"Test run: {test_run.id}, Test suite: {test_suite.test_suite}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}") + data_chars = run_data_chars_refresh(connection, table_group, test_run.test_starttime) + test_run.set_progress("data_chars", "Completed") + + sql_generator = TestExecutionSQL(connection, table_group, test_run) + + LOG.info("Retrieving active test definitions in test suite") + test_defs = fetch_dict_from_db(*sql_generator.get_active_test_definitions()) + test_defs = [TestExecutionDef(**item) for item in test_defs] + + if test_defs: + LOG.info(f"Active test definitions: {len(test_defs)}") + test_run.set_progress("validation", "Running") + test_run.save() + + valid_test_defs = run_test_validation(sql_generator, test_defs) + invalid_count = len(test_defs) - len(valid_test_defs) + test_run.set_progress( + "validation", + "Warning" if invalid_count else "Completed", + error=f"{invalid_count} test{'s' if invalid_count > 1 else ''} had errors. See details in results." 
if invalid_count else None, + ) + + if valid_test_defs: + LOG.info("Updating historic test thresholds") + execute_db_queries([sql_generator.update_historic_thresholds()]) + + column_types = {(col.schema_name, col.table_name, col.column_name): col.column_type for col in data_chars} + for td in valid_test_defs: + td.column_type = column_types.get((td.schema_name, td.table_name, td.column_name)) + + run_functions = { + "QUERY": partial(_run_tests, sql_generator, "QUERY"), + "METADATA": partial(_run_tests, sql_generator, "METADATA"), + "CAT": partial(_run_cat_tests, sql_generator), + } + # Run metadata tests last so that results for other tests are available to them + # TODO: TURN ON WHEN ADDING METADATA TESTS + # for run_type in ["QUERY", "CAT", "METADATA"]: + for run_type in ["QUERY", "CAT"]: + if (run_test_defs := [td for td in valid_test_defs if td.run_type == run_type]): + run_functions[run_type](run_test_defs) + else: + test_run.set_progress(run_type, "Completed") + LOG.info(f"No {run_type} tests to run") + else: + LOG.info("No valid tests to run") + else: + LOG.info("No active tests to run") + + LOG.info("Updating test results and test run") + test_run.save() + execute_db_queries(sql_generator.update_test_results()) + # Refresh needed because previous query updates the test run too + test_run.refresh() + except Exception as e: + LOG.exception("Test execution encountered an error.") + LOG.info("Setting test run status to Error") + test_run.log_message = get_exception_message(e) + test_run.test_endtime = datetime.now(UTC) + time_delta + test_run.status = "Error" + test_run.save() + else: + LOG.info("Setting test run status to Completed") + test_run.test_endtime = datetime.now(UTC) + time_delta + test_run.status = "Complete" + test_run.save() + + LOG.info("Updating latest run for test suite") + test_suite.last_complete_test_run_id = test_run.id + test_suite.save() + + _rollup_test_scores(test_run, table_group) + finally: + MixpanelService().send_event( + "run-tests", + source=settings.ANALYTICS_JOB_SOURCE, + username=username, + sql_flavor=connection.sql_flavor_code, + test_count=test_run.test_ct, + run_duration=(test_run.test_endtime - test_run.test_starttime.replace(tzinfo=UTC)).total_seconds(), + scoring_duration=(datetime.now(UTC) + time_delta - test_run.test_endtime).total_seconds(), + ) + + return f""" + {"Test execution encountered an error. Check log for details." if test_run.status == "Error" else "Test execution completed."} + Run ID: {test_run.id} + """ + + +def _run_tests(sql_generator: TestExecutionSQL, run_type: Literal["QUERY", "METADATA"], test_defs: list[TestExecutionDef]) -> None: + test_run = sql_generator.test_run + test_run.set_progress(run_type, "Running") + test_run.save() + + def update_test_progress(progress: ThreadedProgress) -> None: + test_run.set_progress( + run_type, + "Running", + detail=f"{progress['processed']} of {progress['total']}", + error=f"{progress['errors']} test{'s' if progress['errors'] > 1 else ''} had errors. See details in results." 
+ if progress["errors"] + else None, + ) + test_run.save() + + LOG.info(f"Running {run_type} tests: {len(test_defs)}") + test_results, result_columns, error_data = fetch_from_db_threaded( + [sql_generator.run_query_test(td) for td in test_defs], + use_target_db=run_type != "METADATA", + max_threads=sql_generator.connection.max_threads, + progress_callback=update_test_progress, + ) + + if test_results: + LOG.info(f"Writing {run_type} test results") + write_to_app_db(test_results, result_columns, sql_generator.test_results_table) + + if error_count := len(error_data): + LOG.warning(f"Errors running {run_type} tests: {error_count}") + LOG.info(f"Writing {run_type} test errors") + for index, error in error_data.items(): + test_defs[index].errors.append(error) + + error_results = sql_generator.get_test_errors(test_defs) + write_to_app_db(error_results, sql_generator.result_columns, sql_generator.test_results_table) + + test_run.set_progress( + run_type, + "Warning" if error_count else "Completed", + error=f"{error_count} test{'s' if error_count > 1 else ''} had errors" + if error_count + else None, + ) + + +def _run_cat_tests(sql_generator: TestExecutionSQL, test_defs: list[TestExecutionDef]) -> None: + test_run = sql_generator.test_run + test_run.set_progress("CAT", "Running") + test_run.save() + + total_count = len(test_defs) + LOG.info(f"Aggregating CAT tests: {total_count}") + aggregate_queries, aggregate_test_defs = sql_generator.aggregate_cat_tests(test_defs) + + def update_aggegate_progress(progress: ThreadedProgress) -> None: + processed_count = sum(len(aggregate_test_defs[index]) for index in progress["indexes"]) + test_run.set_progress( + "CAT", + "Running", + detail=f"{processed_count} of {total_count}", + error=f"{progress['errors']} {'queries' if progress['errors'] > 1 else 'query'} had errors" + if progress["errors"] + else None, + ) + test_run.save() + + LOG.info(f"Running aggregated CAT test queries: {len(aggregate_queries)}") + aggregate_results, _, aggregate_errors = fetch_from_db_threaded( + aggregate_queries, + use_target_db=True, + max_threads=sql_generator.connection.max_threads, + progress_callback=update_aggegate_progress, + ) + + if aggregate_results: + LOG.info("Writing aggregated CAT test results") + test_results = sql_generator.get_cat_test_results(aggregate_results, aggregate_test_defs) + write_to_app_db(test_results, sql_generator.result_columns, sql_generator.test_results_table) + + error_count = 0 + if aggregate_errors: + LOG.warning(f"Errors running aggregated CAT test queries: {len(aggregate_errors)}") + error_test_defs: list[TestExecutionDef] = [] + for index in aggregate_errors: + error_test_defs.extend(aggregate_test_defs[index]) + + single_queries, single_test_defs = sql_generator.aggregate_cat_tests(error_test_defs, single=True) + + test_run.set_progress( + "CAT", + "Running", + error="Rerunning errored tests singly", + ) + test_run.save() + + def update_single_progress(progress: ThreadedProgress) -> None: + test_run.set_progress( + "CAT", + "Running", + error=( + f"Rerunning errored tests singly: {progress['processed']} of {progress['total']}" + f"\n{progress['errors']} test{'s' if progress['errors'] > 1 else ''} had errors" if progress["errors"] else "" + ), + ) + test_run.save() + + LOG.info(f"Rerunning errored CAT tests singly: {len(single_test_defs)}") + single_results, _, single_errors = fetch_from_db_threaded( + single_queries, + use_target_db=True, + max_threads=sql_generator.connection.max_threads, + progress_callback=update_single_progress, + ) + 
+ if single_results: + LOG.info("Writing single CAT test results") + test_results = sql_generator.get_cat_test_results(single_results, single_test_defs) + write_to_app_db(test_results, sql_generator.result_columns, sql_generator.test_results_table) + + if error_count := len(single_errors): + LOG.warning(f"Errors running CAT tests singly: {error_count}") + LOG.info("Writing single CAT test errors") + error_test_defs: list[TestExecutionDef] = [] + for index, error in single_errors.items(): + td = single_test_defs[index][0] + td.errors.append(error) + error_test_defs.append(td) + + error_results = sql_generator.get_test_errors(error_test_defs) + write_to_app_db(error_results, sql_generator.result_columns, sql_generator.test_results_table) + + test_run.set_progress( + "CAT", + "Warning" if error_count else "Completed", + error=f"{error_count} test{'s' if error_count > 1 else ''} had errors. See details in results." + if error_count + else None, + ) + + +def _rollup_test_scores(test_run: TestRun, table_group: TableGroup) -> None: + try: + LOG.info("Rolling up test scores") + sql_generator = RollupScoresSQL(test_run.id, table_group.id) + execute_db_queries( + sql_generator.rollup_test_scores(update_prevalence=True, update_table_group=True), + ) + run_refresh_score_cards_results( + project_code=table_group.project_code, + add_history_entry=True, + refresh_date=test_run.test_starttime, + ) + except Exception: + LOG.exception("Error rolling up test scores") diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py deleted file mode 100644 index f31be1ba..00000000 --- a/testgen/commands/run_test_parameter_validation.py +++ /dev/null @@ -1,119 +0,0 @@ -import logging -from collections import defaultdict -from itertools import chain - -from testgen.commands.queries.test_parameter_validation_query import CTestParamValidationSQL -from testgen.common import ( - execute_db_queries, - fetch_dict_from_db, - fetch_list_from_db, -) -from testgen.common.get_pipeline_parms import TestExecutionParams - -LOG = logging.getLogger("testgen") - - -def run_parameter_validation_queries( - params: TestExecutionParams, - test_run_id: str = "", - test_time: str = "", - test_suite: str = "", -): - LOG.info("CurrentStep: Initializing Test Parameter Validation") - clsExecute = CTestParamValidationSQL(params["sql_flavor"], params["test_suite_id"]) - clsExecute.run_date = test_time - clsExecute.test_run_id = test_run_id - LOG.info("CurrentStep: Validation Class successfully initialized") - - # Retrieve Test Column list - LOG.info("CurrentStep: Retrieve Test Columns for Validation") - test_columns, _ = fetch_list_from_db(*clsExecute.GetTestValidationColumns()) - - invalid_tests = [ test_ids for col, test_ids in test_columns if not col ] - invalid_tests = { item for sublist in invalid_tests for item in sublist } - test_columns = [ item for item in test_columns if item[0] ] - - if not test_columns: - LOG.warning(f"No test columns are present to validate in Test Suite {test_suite}") - missing_columns = [] - missing_tables = set() - else: - # Derive test schema list -- make CSV string from list of columns - # to be used as criteria for retrieving data dictionary - setSchemas = {col.split(".")[0] for col, _ in test_columns} - strSchemas = ", ".join([f"'{value}'" for value in setSchemas]) - - # Retrieve Current Project Column list - LOG.info("CurrentStep: Retrieve Current Columns for Validation") - clsExecute.tg_schema = params["table_group_schema"] - clsExecute.test_schemas = 
strSchemas - lstProjectTestColumns = fetch_dict_from_db(*clsExecute.GetProjectTestValidationColumns(), use_target_db=True) - - if len(lstProjectTestColumns) == 0: - LOG.info("Current Test Column list is empty") - - LOG.info("CurrentStep: Compare column sets") - # load results into sets - result_set1 = {col.lower() for col, _ in test_columns} - result_set2 = {item["columns"].lower() for item in set(lstProjectTestColumns)} - - # Check if all columns exist in the table - missing_columns = result_set1.difference(result_set2) - missing_columns = [ col for col in missing_columns if col.rsplit(".", 1)[1] ] - if missing_columns: - LOG.info("Missing columns: %s", ", ".join(missing_columns)) - - # Extracting schema.tables that are missing from the result sets - tables_set1 = {elem.rsplit(".", 1)[0] for elem in result_set1} - tables_set2 = {elem.rsplit(".", 1)[0] for elem in result_set2} - - # Check if all the tables exist in the schema - missing_tables = tables_set1.difference(tables_set2) - - if missing_tables: - LOG.info("Missing tables: %s", ", ".join(missing_tables)) - - if missing_columns or missing_tables or invalid_tests: - # Flag test_definitions tests with missing tables or columns - LOG.info("CurrentStep: Flagging Tests That Failed Validation") - - tests_missing_tables = defaultdict(list) - tests_missing_columns = defaultdict(list) - for column_name, test_ids in test_columns: - column_name = column_name.lower() - table_name = column_name.rsplit(".", 1)[0] - if table_name in missing_tables: - tests_missing_tables[table_name].extend(test_ids) - elif column_name in missing_columns: - tests_missing_columns[column_name].extend(test_ids) - - clsExecute.flag_val = "D" - clsExecute.test_ids = list(set(chain(*tests_missing_tables.values(), *tests_missing_columns.values(), invalid_tests))) - execute_db_queries([clsExecute.PrepFlagTestsWithFailedValidation()]) - - for column_name, test_ids in tests_missing_columns.items(): - clsExecute.message = f"Missing column: {column_name}" - clsExecute.test_ids = test_ids - execute_db_queries([clsExecute.FlagTestsWithFailedValidation()]) - - for table_name, test_ids in tests_missing_tables.items(): - clsExecute.message = f"Missing table: {table_name}" - clsExecute.test_ids = test_ids - execute_db_queries([clsExecute.FlagTestsWithFailedValidation()]) - - if invalid_tests: - clsExecute.message = "Invalid test: schema, table, or column not defined" - clsExecute.test_ids = invalid_tests - execute_db_queries([clsExecute.FlagTestsWithFailedValidation()]) - - # Copy test results to DK DB, using temporary flagged D value to identify - LOG.info("CurrentStep: Saving error results for invalid tests") - execute_db_queries([clsExecute.ReportTestValidationErrors()]) - - # Set to Inactive those test_definitions tests that are flagged D: set to N - LOG.info("CurrentStep: Disabling Tests That Failed Validation") - execute_db_queries([clsExecute.DisableTestsWithFailedValidation()]) - - LOG.info("Validation Complete: Tests referencing missing tables or columns have been deactivated.") - else: - LOG.info("Validation Successful: No tables or columns missing from target database.") diff --git a/testgen/commands/run_test_validation.py b/testgen/commands/run_test_validation.py new file mode 100644 index 00000000..55fb6185 --- /dev/null +++ b/testgen/commands/run_test_validation.py @@ -0,0 +1,105 @@ +import logging +import re +from uuid import UUID + +from testgen.commands.queries.execute_tests_query import TestExecutionDef, TestExecutionSQL +from testgen.common import 
execute_db_queries, fetch_dict_from_db +from testgen.common.database.database_service import write_to_app_db + +LOG = logging.getLogger("testgen") + + +def run_test_validation(sql_generator: TestExecutionSQL, test_defs: list[TestExecutionDef]) -> list[TestExecutionDef]: + test_defs_by_id: dict[UUID, TestExecutionDef] = {td.id: td for td in test_defs} + identifiers_to_check: dict[tuple[str, str, str | None], set[UUID]] = {} + target_schemas = set() + quote = sql_generator.flavor_service.quote_character + + def add_identifiers(test_id: UUID, schema: str, table: str, columns: str | None = None, single_column: bool = False) -> None: + target_schemas.add(schema) + if columns: + if single_column: + identifiers = [(schema.lower(), table.lower(), columns.strip(f" {quote}").lower())] + else: + column_names = re.split(rf",(?=(?:[^\{quote}]*\{quote}[^\{quote}]*\{quote})*[^\{quote}]*$)", columns) + column_names = [col.strip(f" {quote}") for col in column_names] + identifiers = [(schema.lower(), table.lower(), col.lower()) for col in column_names if col] + else: + identifiers = [(schema.lower(), table.lower(), None)] + + for key in identifiers: + if not identifiers_to_check.get(key): + identifiers_to_check[key] = set() + identifiers_to_check[key].add(test_id) + + def add_test_error(test_ids: list[UUID], error: str) -> None: + for test_id in test_ids: + if not test_defs_by_id[test_id].errors: + test_defs_by_id[test_id].errors.append("Deactivated") + test_defs_by_id[test_id].errors.append(error) + + for td in test_defs: + # No validation needed for custom query or table group tests + if td.test_type == "CUSTOM" or td.test_scope == "tablegroup": + continue + + if td.schema_name and td.table_name and (td.column_name or td.test_scope in ["table", "custom"]): + if td.test_scope in ["table", "custom"] or td.test_type.startswith("Aggregate_"): + # Validate only table for these test types - column is meaningless or uses aggregation functions + add_identifiers(td.id, td.schema_name, td.table_name) + else: + add_identifiers(td.id, td.schema_name, td.table_name, td.column_name, single_column=td.test_scope == "column") + + if td.groupby_names: + add_identifiers(td.id, td.schema_name, td.table_name, td.groupby_names) + + if td.test_scope == "referential": + if td.window_date_column: + add_identifiers(td.id, td.schema_name, td.table_name, td.window_date_column) + + if td.match_column_names or td.match_groupby_names: + if td.match_schema_name and td.match_table_name: + if td.match_column_names and not td.test_type.startswith("Aggregate_"): + add_identifiers(td.id, td.match_schema_name, td.match_table_name, td.match_column_names) + if td.match_groupby_names: + add_identifiers(td.id, td.match_schema_name, td.match_table_name, td.match_groupby_names) + else: + add_test_error([td.id], "Invalid test: match schema, table, or column not defined") + else: + add_test_error([td.id], "Invalid test: schema, table, or column not defined") + + if target_schemas: + LOG.info("Getting tables and columns in target schemas for validation") + target_identifiers = fetch_dict_from_db( + *sql_generator.get_target_identifiers(target_schemas), + use_target_db=True, + ) + if not target_identifiers: + LOG.info("No tables or columns present in target schemas") + + # Normalize identifiers before validating + target_tables = {(item["schema_name"].lower(), item["table_name"].lower()) for item in target_identifiers} + target_columns = { + (item["schema_name"].lower(), item["table_name"].lower(), item["column_name"].lower()) + for item in 
target_identifiers + } + + for identifier, test_ids in identifiers_to_check.items(): + table = (identifier[0], identifier[1]) + if table not in target_tables: + add_test_error(test_ids, f"Missing table: {'.'.join(table)}") + elif identifier[2] and identifier not in target_columns: + add_test_error(test_ids, f"Missing column: {'.'.join(identifier)}") + + error_results = sql_generator.get_test_errors(test_defs_by_id.values()) + if error_results: + LOG.warning(f"Tests in test suite failed validation: {len(error_results)}") + LOG.info("Writing test validation errors to test results") + write_to_app_db(error_results, sql_generator.result_columns, sql_generator.test_results_table) + + LOG.info("Disabling tests in test suite that failed validation") + execute_db_queries([sql_generator.disable_invalid_test_definitions()]) + else: + LOG.info("No tests in test suite failed validation") + + return [td for td in test_defs if not td.errors] diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py index e144f07c..95ec4bc0 100644 --- a/testgen/commands/run_upgrade_db_config.py +++ b/testgen/commands/run_upgrade_db_config.py @@ -96,7 +96,6 @@ def _refresh_static_metadata(params_mapping): user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", - suppress_logs=True, ) import_metadata_records_from_yaml(params_mapping) @@ -107,7 +106,6 @@ def _refresh_static_metadata(params_mapping): user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", - suppress_logs=True, ) diff --git a/testgen/common/clean_sql.py b/testgen/common/clean_sql.py index 27299615..8d1856e3 100644 --- a/testgen/common/clean_sql.py +++ b/testgen/common/clean_sql.py @@ -17,6 +17,9 @@ def CleanSQL(strInput: str) -> str: def quote_identifiers(identifiers: str, flavor: str) -> str: + if not identifiers: + return "" + # Keywords -- identifiers to quote keywords = [ "select", @@ -47,15 +50,13 @@ def quote_identifiers(identifiers: str, flavor: str) -> str: return ", ".join(quoted_values) -def ConcatColumnList(str_column_list, str_null_value): - # Prepares SQL expression to concatenate comma-separated column list into single SQL expression - str_expression = "" - if str_column_list: - if "," in str_column_list: - # Split each comma separated column name into individual list items - cols = [s.strip() for s in str_column_list.split(",")] - str_each = [f"COALESCE({i}, '{str_null_value}')" for i in cols] - str_expression = "CONCAT(" + ", ".join(str_each) + ")" +def concat_columns(columns: str, null_value: str): + # Prepares SQL expression to concatenate comma-separated column list + expression = "" + if columns: + if "," in columns: + column_list = [f"COALESCE({col.strip()}, '{null_value}')" for col in columns.split(",")] + expression = f"CONCAT({', '.join(column_list)})" else: - str_expression = str_column_list - return str_expression + expression = columns + return expression diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py index 8adbe7cf..cc712403 100644 --- a/testgen/common/database/database_service.py +++ b/testgen/common/database/database_service.py @@ -2,15 +2,13 @@ import csv import importlib import logging -import queue as qu -import threading +from collections.abc import Callable, Iterable from contextlib import suppress from dataclasses import dataclass, field from typing import Any, 
Literal, TypedDict from urllib.parse import quote_plus import psycopg2.sql -from progress.spinner import Spinner from sqlalchemy import create_engine, text from sqlalchemy.engine import LegacyRow, RowMapping from sqlalchemy.engine.base import Connection, Engine @@ -29,6 +27,7 @@ from testgen.common.database import FilteredStringIO from testgen.common.database.flavor.flavor_service import ConnectionParams, FlavorService, SQLFlavor from testgen.common.read_file import get_template_files +from testgen.utils import get_exception_message LOG = logging.getLogger("testgen") @@ -95,7 +94,7 @@ def create_database( drop_existing: bool = False, drop_users_and_roles: bool = False, ) -> None: - LOG.info("DB operation: create_database on App database (User type = database_admin)") + LOG.debug("DB operation: create_database on App database (User type = database_admin)") connection = _init_db_connection( user_override=params["TESTGEN_ADMIN_USER"], @@ -134,19 +133,16 @@ def execute_db_queries( user_override: str | None = None, password_override: str | None = None, user_type: UserType = "normal", - suppress_logs: bool = False, ) -> tuple[list[Any], list[int]]: - LOG.info(f"DB operation: execute_db_queries ({len(queries)}) on {'Target' if use_target_db else 'App'} database (User type = {user_type})") + LOG.debug(f"DB operation: execute_db_queries ({len(queries)}) on {'Target' if use_target_db else 'App'} database (User type = {user_type})") with _init_db_connection(use_target_db, user_override, password_override, user_type) as connection: return_values: list[Any] = [] row_counts: list[int] = [] if not queries: - LOG.info("No queries to process") + LOG.debug("No queries to process") for index, (query, params) in enumerate(queries): - LOG.debug(f"Query: {query}") - if not suppress_logs: - LOG.info(f"Processing {index + 1} of {len(queries)} queries") + LOG.debug(f"Query {index + 1} of {len(queries)}: {query}") transaction = connection.begin() result = connection.execute(text(query), params) row_counts.append(result.rowcount) @@ -166,55 +162,80 @@ def execute_db_queries( return return_values, row_counts +class ThreadedProgress(TypedDict): + processed: int + errors: int + total: int + indexes: list[int] + def fetch_from_db_threaded( queries: list[tuple[str, dict | None]], use_target_db: bool = False, - max_threads: int | None = None, - spinner: Spinner | None = None, -) -> tuple[list[LegacyRow], list[str], int]: - LOG.info(f"DB operation: fetch_from_db_threaded on {'Target' if use_target_db else 'App'} database (User type = normal)") + max_threads: int = 4, + progress_callback: Callable[[ThreadedProgress], None] | None = None, +) -> tuple[list[LegacyRow], list[str], dict[int, str]]: + LOG.debug(f"DB operation: fetch_from_db_threaded ({len(queries)}) on {'Target' if use_target_db else 'App'} database (User type = normal)") - result_data = [] - result_columns: list[str] = [] - error_count = 0 + def fetch_data(query: str, params: dict | None, index: int) -> tuple[list[LegacyRow], list[str], int, str | None]: + LOG.debug(f"Query: {query}") + row_data: list[LegacyRow] = [] + column_names: list[str] = [] + error = None - if not max_threads or max_threads < 1 or max_threads > 10: - max_threads = 4 + try: + with _init_db_connection(use_target_db) as connection: + result = connection.execute(text(query), params) + LOG.debug(f"{result.rowcount} records retrieved") + row_data = result.fetchall() + column_names = list(result.keys()) + except Exception as e: + error = get_exception_message(e) + LOG.exception(f"Failed to 
execute threaded query: {query}") - queue = qu.Queue() - for item in queries: - queue.put(item) + return row_data, column_names, index, error + + result_data: list[LegacyRow] = [] + result_columns: list[str] = [] + error_data: dict[int, str] = {} - threaded_fetch = _ThreadedFetch(use_target_db, threading.Lock()) + query_count = len(queries) + processed_count = 0 + processed_indexes: list[int] = [] + max_threads = max(1, min(10, max_threads)) with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: - try: - futures = [] - while not queue.empty(): - query, params = queue.get() - futures.append(executor.submit(threaded_fetch, query, params)) - - for future in futures: - row_data, column_names, has_errors = future.result() - if spinner: - spinner.next() - error_count += 1 if has_errors else 0 - if row_data: - result_data.append(row_data) - result_columns = column_names - - except Exception: - LOG.exception("Failed to execute threaded queries") + futures = [ + executor.submit(fetch_data, query, params, index) + for index, (query, params) in enumerate(queries) + ] + for future in concurrent.futures.as_completed(futures): + row_data, column_names, index, error = future.result() + if row_data: + result_data.append(row_data) + result_columns = column_names + if error: + error_data[index] = error + + processed_count += 1 + processed_indexes.append(index) + if progress_callback: + progress_callback({ + "processed": processed_count, + "errors": len(error_data), + "total": query_count, + "indexes": processed_indexes, + }) + LOG.debug(f"Processed {processed_count} of {query_count} threaded queries") # Flatten nested lists result_data = [element for sublist in result_data for element in sublist] - return result_data, result_columns, error_count + return result_data, result_columns, error_data def fetch_list_from_db( query: str, params: dict | None = None, use_target_db: bool = False ) -> tuple[list[LegacyRow], list[str]]: - LOG.info(f"DB operation: fetch_list_from_db on {'Target' if use_target_db else 'App'} database (User type = normal)") + LOG.debug(f"DB operation: fetch_list_from_db on {'Target' if use_target_db else 'App'} database (User type = normal)") with _init_db_connection(use_target_db) as connection: LOG.debug(f"Query: {query}") @@ -229,7 +250,7 @@ def fetch_list_from_db( def fetch_dict_from_db( query: str, params: dict | None = None, use_target_db: bool = False ) -> list[RowMapping]: - LOG.info(f"DB operation: fetch_dict_from_db on {'Target' if use_target_db else 'App'} database (User type = normal)") + LOG.debug(f"DB operation: fetch_dict_from_db on {'Target' if use_target_db else 'App'} database (User type = normal)") with _init_db_connection(use_target_db) as connection: LOG.debug(f"Query: {query}") @@ -239,8 +260,8 @@ def fetch_dict_from_db( return [row._mapping for row in result] -def write_to_app_db(data: list[LegacyRow], column_names: list[str], table_name: str) -> None: - LOG.info("DB operation: write_to_app_db on App database (User type = normal)") +def write_to_app_db(data: list[LegacyRow], column_names: Iterable[str], table_name: str) -> None: + LOG.debug("DB operation: write_to_app_db on App database (User type = normal)") # use_raw is required to make use of the copy_expert method for fast batch ingestion connection = _init_db_connection(use_raw=True) @@ -384,37 +405,3 @@ def _init_target_db_connection() -> Connection: ) return connection - - -class _ThreadedFetch: - def __init__(self, use_target_db: bool, count_lock: threading.Lock): - self.use_target_db 
= use_target_db - self.count_lock = count_lock - self.count = 0 - - def __call__(self, query: str, params: dict | None = None) -> tuple[list[LegacyRow], list[str], bool]: - LOG.debug(f"Query: {query}") - column_names: list[str] = [] - row_data: list = None - has_errors = False - - with self.count_lock: - self.count += 1 - i = self.count - - try: - with _init_db_connection(self.use_target_db) as connection: - try: - result = connection.execute(text(query), params) - LOG.debug(f"{result.rowcount} records retrieved") - row_data = result.fetchall() - if not column_names: - column_names = result.keys() - LOG.info(f"Processed threaded query {i} on thread {threading.current_thread().name}") - except Exception: - LOG.exception(f"Failed to execute threaded query: {query}") - has_errors = True - except Exception as e: - raise ValueError(f"Failed to execute threaded query: {e}") from e - else: - return row_data, list(column_names), has_errors diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py index 9849f1bb..bb253595 100644 --- a/testgen/common/database/flavor/flavor_service.py +++ b/testgen/common/database/flavor/flavor_service.py @@ -22,6 +22,8 @@ class ConnectionParams(TypedDict): private_key_passphrase: bytes http_path: str service_account_key: dict[str,Any] + connect_with_identity: bool + sql_flavor_code: str class FlavorService: @@ -49,6 +51,8 @@ def init(self, connection_params: ConnectionParams): self.catalog = connection_params.get("catalog") or "" self.warehouse = connection_params.get("warehouse") or "" self.service_account_key = connection_params.get("service_account_key", None) + self.connect_with_identity = connection_params.get("connect_with_identity") or False + self.sql_flavor_code = connection_params.get("sql_flavor_code") or self.flavor password = connection_params.get("project_pw_encrypted", None) if isinstance(password, memoryview) or isinstance(password, bytes): diff --git a/testgen/common/database/flavor/mssql_flavor_service.py b/testgen/common/database/flavor/mssql_flavor_service.py index f4e3f1be..088c11e9 100644 --- a/testgen/common/database/flavor/mssql_flavor_service.py +++ b/testgen/common/database/flavor/mssql_flavor_service.py @@ -1,5 +1,7 @@ from urllib.parse import quote_plus +from sqlalchemy.engine import URL + from testgen import settings from testgen.common.database.flavor.flavor_service import FlavorService @@ -14,14 +16,28 @@ def get_connection_string_head(self): return f"mssql+pyodbc://{self.username}:{quote_plus(self.password)}@" def get_connection_string_from_fields(self): - strConnect = ( - f"mssql+pyodbc://{self.username}:{quote_plus(self.password)}@{self.host}:{self.port}/{self.dbname}?driver=ODBC+Driver+18+for+SQL+Server" + connection_url = URL.create( + "mssql+pyodbc", + username=self.username, + password=quote_plus(self.password or ""), + host=self.host, + port=int(self.port or 1443), + database=self.dbname, + query={ + "driver": "ODBC Driver 18 for SQL Server", + }, ) - if "synapse" in self.host: - strConnect += "&autocommit=True" + if self.connect_with_identity: + connection_url = connection_url._replace(username=None, password=None).update_query_dict({ + "encrypt": "yes", + "authentication": "ActiveDirectoryMsi", + }) + + if self.sql_flavor_code == "synapse_mssql": + connection_url = connection_url.update_query_dict({"autocommit": "True"}) - return strConnect + return connection_url.render_as_string(hide_password=False) def get_pre_connection_queries(self): return [ diff --git 
a/testgen/common/date_service.py b/testgen/common/date_service.py index 41e34125..000f0652 100644 --- a/testgen/common/date_service.py +++ b/testgen/common/date_service.py @@ -1,28 +1,10 @@ -from datetime import UTC, datetime, timedelta +from datetime import UTC, datetime import pandas as pd -def get_today_as_string(): - return datetime.utcnow().strftime("%Y-%m-%d") - - -def get_now_as_string(): - return datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - - -def parse_now(value: str) -> datetime: - return datetime.strptime(value, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC) - - -def get_now_as_string_with_offset(minutes_offset): - ret = datetime.utcnow() - ret = ret + timedelta(minutes=minutes_offset) - return ret.strftime("%Y-%m-%d %H:%M:%S") - - def get_now_as_iso_timestamp(): - return as_iso_timestamp(datetime.utcnow()) + return as_iso_timestamp(datetime.now(UTC)) def as_iso_timestamp(date: datetime) -> str | None: @@ -47,14 +29,6 @@ def accommodate_dataframe_to_timezone(df, streamlit_session, time_columns=None): df[time_column] = df[time_column].dt.strftime("%Y-%m-%d %H:%M:%S") -def create_timezoned_column_in_dataframe(streamlit_session, df, new_column_name, existing_column_name): - if new_column_name and existing_column_name and "browser_timezone" in streamlit_session: - timezone = streamlit_session["browser_timezone"] - df[new_column_name] = ( - df[existing_column_name].dt.tz_localize("UTC").dt.tz_convert(timezone).dt.strftime("%Y-%m-%d %H:%M:%S") - ) - - def get_timezoned_timestamp(streamlit_session, value, dateformat="%b %-d, %-I:%M %p"): ret = None if value and "browser_timezone" in streamlit_session: @@ -64,25 +38,3 @@ def get_timezoned_timestamp(streamlit_session, value, dateformat="%b %-d, %-I:%M df["value"] = df["value"].dt.tz_localize("UTC").dt.tz_convert(timezone).dt.strftime(dateformat) ret = df.iloc[0, 0] return ret - - -def get_timezoned_now(streamlit_session): - value = datetime.utcnow() - return get_timezoned_timestamp(streamlit_session, value) - - -def get_formatted_duration(duration: str | None) -> str: - if not duration: - return "--" - - hour, minute, second = duration.split(":") - formatted = "" - if int(hour): - formatted += f"{int(hour)!s}h" - if int(minute): - formatted += f" {int(minute)!s}m" - if int(second): - formatted += f" {int(second)!s}s" - - formatted = formatted.strip() or "< 1s" - return formatted diff --git a/testgen/common/email.py b/testgen/common/email.py new file mode 100644 index 00000000..1579ce05 --- /dev/null +++ b/testgen/common/email.py @@ -0,0 +1,90 @@ +import logging +import smtplib +import ssl +from collections.abc import Mapping +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText + +from pybars import Compiler + +from testgen import settings + +LOG = logging.getLogger(__name__) + +MANDATORY_SETTINGS = ( + "EMAIL_FROM_ADDRESS", + "SMTP_ENDPOINT", + "SMTP_PORT", + "SMTP_USERNAME", + "SMTP_PASSWORD", +) + + +class EmailTemplateException(Exception): + pass + + +class BaseEmailTemplate: + + def __init__(self): + compiler = Compiler() + self.compiled_subject = compiler.compile(self.get_subject_template()) + self.compiled_body = compiler.compile(self.get_body_template()) + + def validate_settings(self): + missing_settings = [ + f"TG_{setting_name}" + for setting_name in MANDATORY_SETTINGS + if getattr(settings, setting_name) is None + ] + + if missing_settings: + LOG.error( + "Template '%s' can not send emails because the following settings are missing: %s", + self.__class__.__name__, + ", ".join(missing_settings), 
+ ) + + raise EmailTemplateException("Invalid or insufficient email/SMTP settings") + + def get_subject_template(self) -> str: + raise NotImplementedError + + def get_body_template(self) -> str: + raise NotImplementedError + + def get_message(self, recipients: list[str], context: Mapping | None) -> MIMEMultipart: + subject = self.compiled_subject(context) + body = self.compiled_body(context) + + message = MIMEMultipart("alternative") + message["Subject"] = subject + message["To"] = ", ".join(recipients) + message["From"] = settings.EMAIL_FROM_ADDRESS + message.attach(MIMEText(body, "html")) + return message + + def send_mime_message(self, recipients: list[str], message: MIMEMultipart) -> dict: + ssl_context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + try: + with smtplib.SMTP_SSL(settings.SMTP_ENDPOINT, settings.SMTP_PORT, context=ssl_context) as smtp_server: + smtp_server.login(settings.SMTP_USERNAME, settings.SMTP_PASSWORD) + response = smtp_server.sendmail(settings.EMAIL_FROM_ADDRESS, recipients, message.as_string()) + except Exception as e: + LOG.error("Template '%s' failed to send email with: %s", self.__class__.__name__, e) # noqa: TRY400 + else: + return response + + def send(self, recipients: list[str], context: Mapping | None) -> dict: + self.validate_settings() + mime_message = self.get_message(recipients, context) + response = self.send_mime_message(recipients, mime_message) + + LOG.info( + "Template '%s' successfully sent email to %d recipients -- %d failed.", + self.__class__.__name__, + len(recipients) - len(response), + len(response) + ) + + return response diff --git a/testgen/common/get_pipeline_parms.py b/testgen/common/get_pipeline_parms.py index 3c37aacf..1f10b364 100644 --- a/testgen/common/get_pipeline_parms.py +++ b/testgen/common/get_pipeline_parms.py @@ -8,52 +8,12 @@ class BaseParams(TypedDict): project_code: str connection_id: str -class ProfilingParams(BaseParams): - table_groups_id: str - profiling_table_set: str - profiling_include_mask: str - profiling_exclude_mask: str - profile_id_column_mask: str - profile_sk_column_mask: str - profile_use_sampling: str - profile_flag_cdes: bool - profile_sample_percent: str - profile_sample_min_count: int - profile_do_pair_rules: str - profile_pair_rule_pct: int - monitor_test_suite_id: str | None - last_complete_profile_run_id: str | None - - class TestGenerationParams(BaseParams): export_to_observability: str test_suite_id: str profiling_as_of_date: str -class TestExecutionParams(BaseParams): - test_suite_id: str - table_groups_id: str - table_group_schema: str - profiling_table_set: str - profiling_include_mask: str - profiling_exclude_mask: str - sql_flavor: str - max_threads: int - max_query_chars: int - - - -def get_profiling_params(table_group_id: str) -> ProfilingParams: - results = fetch_dict_from_db( - read_template_sql_file("parms_profiling.sql", "parms"), - {"TABLE_GROUP_ID": table_group_id}, - ) - if not results: - raise ValueError("Connection parameters not found for profiling.") - return ProfilingParams(results[0]) - - def get_test_generation_params(table_group_id: str, test_suite: str) -> TestGenerationParams: results = fetch_dict_from_db( read_template_sql_file("parms_test_gen.sql", "parms"), @@ -62,13 +22,3 @@ def get_test_generation_params(table_group_id: str, test_suite: str) -> TestGene if not results: raise ValueError("Connection parameters not found for test generation.") return TestGenerationParams(results[0]) - - -def get_test_execution_params(project_code: str, test_suite: str) -> TestExecutionParams: 
- results = fetch_dict_from_db( - read_template_sql_file("parms_test_execution.sql", "parms"), - {"PROJECT_CODE": project_code, "TEST_SUITE": test_suite} - ) - if not results: - raise ValueError("Connection parameters not found for test execution.") - return TestExecutionParams(results[0]) diff --git a/testgen/common/models/connection.py b/testgen/common/models/connection.py index 84f71aa5..1b5a96f5 100644 --- a/testgen/common/models/connection.py +++ b/testgen/common/models/connection.py @@ -62,6 +62,7 @@ class Connection(Entity): http_path: str = Column(String) warehouse: str = Column(String) service_account_key: JSON_TYPE = Column(EncryptedJson) + connect_with_identity: bool = Column(Boolean, default=False) _get_by = "connection_id" _default_order_by = (asc(func.lower(connection_name)),) diff --git a/testgen/common/models/entity.py b/testgen/common/models/entity.py index 8545b3da..2fe3ac93 100644 --- a/testgen/common/models/entity.py +++ b/testgen/common/models/entity.py @@ -138,6 +138,10 @@ def clear_cache(cls) -> None: @classmethod def columns(cls) -> list[str]: return list(cls.__annotations__.keys()) + + def refresh(self) -> None: + db_session = get_current_session() + db_session.refresh(self) def save(self) -> None: is_new = self.id is None diff --git a/testgen/common/models/profiling_run.py b/testgen/common/models/profiling_run.py index da848f76..0343b99f 100644 --- a/testgen/common/models/profiling_run.py +++ b/testgen/common/models/profiling_run.py @@ -1,13 +1,14 @@ from collections.abc import Iterable from dataclasses import dataclass from datetime import UTC, datetime -from typing import Literal, NamedTuple -from uuid import UUID +from typing import Literal, NamedTuple, TypedDict +from uuid import UUID, uuid4 import streamlit as st from sqlalchemy import BigInteger, Column, Float, Integer, String, desc, func, select, text, update from sqlalchemy.dialects import postgresql from sqlalchemy.orm import InstrumentedAttribute +from sqlalchemy.orm.attributes import flag_modified from sqlalchemy.sql.expression import case from testgen.common.models import get_current_session @@ -16,7 +17,15 @@ from testgen.utils import is_uuid4 ProfilingRunStatus = Literal["Running", "Complete", "Error", "Cancelled"] +ProgressKey = Literal["data_chars", "col_profiling", "freq_analysis", "hygiene_issues"] +ProgressStatus = Literal["Pending", "Running", "Completed", "Warning"] +class ProgressStep(TypedDict): + key: ProgressKey + status: ProgressStatus + label: str + detail: str + error: str @dataclass class ProfilingRunMinimal(EntityMinimal): @@ -32,16 +41,19 @@ class ProfilingRunMinimal(EntityMinimal): @dataclass class ProfilingRunSummary(EntityMinimal): - profiling_run_id: UUID - start_time: datetime - end_time: datetime + id: UUID + profiling_starttime: datetime + profiling_endtime: datetime table_groups_name: str status: ProfilingRunStatus + progress: list[ProgressStep] process_id: int log_message: str - schema_name: str + table_group_schema: str table_ct: int column_ct: int + record_ct: int + data_point_ct: int anomaly_ct: int anomalies_definite_ct: int anomalies_likely_ct: int @@ -58,16 +70,19 @@ class LatestProfilingRun(NamedTuple): class ProfilingRun(Entity): __tablename__ = "profiling_runs" - id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True) + id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True, default=uuid4) project_code: str = Column(String, nullable=False) connection_id: str = Column(BigInteger, nullable=False) table_groups_id: UUID = 
Column(postgresql.UUID(as_uuid=True), nullable=False) profiling_starttime: datetime = Column(postgresql.TIMESTAMP) profiling_endtime: datetime = Column(postgresql.TIMESTAMP) status: ProfilingRunStatus = Column(String, default="Running") + progress: list[ProgressStep] = Column(postgresql.JSONB, default=[]) log_message: str = Column(String) table_ct: int = Column(BigInteger) column_ct: int = Column(BigInteger) + record_ct: int = Column(BigInteger) + data_point_ct: int = Column(BigInteger) anomaly_ct: int = Column(BigInteger) anomaly_table_ct: int = Column(BigInteger) anomaly_column_ct: int = Column(BigInteger) @@ -176,28 +191,32 @@ def select_summary( ) GROUP BY profile_anomaly_results.profile_run_id ) - SELECT v_profiling_runs.profiling_run_id, - v_profiling_runs.start_time, - v_profiling_runs.end_time, - v_profiling_runs.table_groups_name, - v_profiling_runs.status, - v_profiling_runs.process_id, - v_profiling_runs.log_message, - v_profiling_runs.schema_name, - v_profiling_runs.table_ct, - v_profiling_runs.column_ct, - v_profiling_runs.anomaly_ct, + SELECT profiling_runs.id, + profiling_runs.profiling_starttime, + profiling_runs.profiling_endtime, + table_groups.table_groups_name, + profiling_runs.status, + profiling_runs.progress, + profiling_runs.process_id, + profiling_runs.log_message, + table_groups.table_group_schema, + profiling_runs.table_ct, + profiling_runs.column_ct, + profiling_runs.record_ct, + profiling_runs.data_point_ct, + profiling_runs.anomaly_ct, profile_anomalies.definite_ct AS anomalies_definite_ct, profile_anomalies.likely_ct AS anomalies_likely_ct, profile_anomalies.possible_ct AS anomalies_possible_ct, profile_anomalies.dismissed_ct AS anomalies_dismissed_ct, - v_profiling_runs.dq_score_profiling - FROM v_profiling_runs - LEFT JOIN profile_anomalies ON (v_profiling_runs.profiling_run_id = profile_anomalies.profile_run_id) - WHERE project_code = :project_code - {"AND v_profiling_runs.table_groups_id = :table_group_id" if table_group_id else ""} - {"AND v_profiling_runs.profiling_run_id IN :profiling_run_ids" if profiling_run_ids else ""} - ORDER BY start_time DESC; + profiling_runs.dq_score_profiling + FROM profiling_runs + LEFT JOIN table_groups ON (profiling_runs.table_groups_id = table_groups.id) + LEFT JOIN profile_anomalies ON (profiling_runs.id = profile_anomalies.profile_run_id) + WHERE profiling_runs.project_code = :project_code + {"AND profiling_runs.table_groups_id = :table_group_id" if table_group_id else ""} + {"AND profiling_runs.id IN :profiling_run_ids" if profiling_run_ids else ""} + ORDER BY profiling_starttime DESC; """ params = { "project_code": project_code, @@ -225,8 +244,8 @@ def cancel_all_running(cls) -> None: cls.clear_cache() @classmethod - def update_status(cls, run_id: str | UUID, status: ProfilingRunStatus) -> None: - query = update(cls).where(cls.id == run_id).values(status=status) + def cancel_run(cls, run_id: str | UUID) -> None: + query = update(cls).where(cls.id == run_id).values(status="Cancelled", profiling_endtime=datetime.now(UTC)) db_session = get_current_session() db_session.execute(query) db_session.commit() @@ -256,5 +275,22 @@ def clear_cache(cls) -> bool: cls.select_minimal_where.clear() cls.select_summary.clear() - def save(self) -> None: - raise NotImplementedError + def init_progress(self) -> None: + self._progress = { + "data_chars": {"label": "Refreshing data catalog"}, + "col_profiling": {"label": "Profiling columns"}, + "freq_analysis": {"label": "Running frequency analysis"}, + "hygiene_issues": {"label": "Detecting 
hygiene issues"}, + } + for key in self._progress: + self._progress[key].update({"key": key, "status": "Pending"}) + + def set_progress(self, key: ProgressKey, status: ProgressStatus, detail: str | None = None, error: str | None = None) -> None: + self._progress[key]["status"] = status + if detail: + self._progress[key]["detail"] = detail + if error: + self._progress[key]["error"] = error + + self.progress = list(self._progress.values()) + flag_modified(self, "progress") diff --git a/testgen/common/models/scheduler.py b/testgen/common/models/scheduler.py index 3cd9cb79..da3f7073 100644 --- a/testgen/common/models/scheduler.py +++ b/testgen/common/models/scheduler.py @@ -4,7 +4,7 @@ from uuid import UUID, uuid4 from cron_converter import Cron -from sqlalchemy import Boolean, Column, String, delete, func, select, update +from sqlalchemy import Boolean, Column, String, cast, delete, func, select, update from sqlalchemy.dialects import postgresql from sqlalchemy.orm import InstrumentedAttribute @@ -33,10 +33,10 @@ class JobSchedule(Base): def select_where(cls, *clauses, order_by: str | InstrumentedAttribute | None = None) -> Iterable[Self]: test_definitions_count = ( select(cls.id) - .join(TestSuite, TestSuite.test_suite == cls.kwargs["test_suite_key"].astext) + .join(TestSuite, TestSuite.id == cast(cls.kwargs["test_suite_id"].astext, postgresql.UUID)) .join(TestDefinition, TestDefinition.test_suite_id == TestSuite.id) .where(cls.key == RUN_TESTS_JOB_KEY, cls.active == True) - .group_by(cls.id, TestSuite.test_suite) + .group_by(cls.id, TestSuite.id) .having(func.count(TestDefinition.id) > 0) .subquery() ) diff --git a/testgen/common/models/scores.py b/testgen/common/models/scores.py index c6db830b..61c3ceb4 100644 --- a/testgen/common/models/scores.py +++ b/testgen/common/models/scores.py @@ -12,9 +12,9 @@ from typing import Literal, Self, TypedDict from uuid import UUID, uuid4 -from sqlalchemy import Boolean, Column, DateTime, Enum, Float, ForeignKey, Integer, String, select, text +from sqlalchemy import Boolean, Column, DateTime, Enum, Float, ForeignKey, Integer, String, delete, func, select, text from sqlalchemy.dialects import postgresql -from sqlalchemy.orm import relationship +from sqlalchemy.orm import aliased, attributes, relationship from testgen.common import read_template_sql_file from testgen.common.models import Base, get_current_session @@ -79,7 +79,7 @@ class ScoreDefinition(Base): criteria: ScoreDefinitionCriteria = relationship( "ScoreDefinitionCriteria", cascade="all, delete-orphan", - lazy="joined", + lazy="select", uselist=False, single_parent=True, ) @@ -93,7 +93,7 @@ class ScoreDefinition(Base): "ScoreDefinitionBreakdownItem", cascade="all, delete-orphan", order_by="ScoreDefinitionBreakdownItem.impact.desc()", - lazy="joined", + lazy="select", ) history: Iterable[ScoreDefinitionResultHistoryEntry] = relationship( "ScoreDefinitionResultHistoryEntry", @@ -136,16 +136,50 @@ def all( project_code: str | None = None, name_filter: str | None = None, sorted_by: str | None = "name", + last_history_items: int = 0, ) -> Iterable[Self]: definitions = [] db_session = get_current_session() - query = select(ScoreDefinition) + query = select(ScoreDefinition).options() if name_filter: query = query.where(ScoreDefinition.name.ilike(f"%{name_filter}%")) if project_code: query = query.where(ScoreDefinition.project_code == project_code) + query = query.order_by(text(sorted_by)) definitions = db_session.scalars(query).unique().all() + definitions_map = {} + + if last_history_items > 0: + for 
definition in definitions: + definitions_map[str(definition.id)] = definition + db_session.expunge(definition) + attributes.set_committed_value(definition, "history", []) + + HistoryEntry = aliased(ScoreDefinitionResultHistoryEntry) + history_subquery = select( + HistoryEntry.definition_id, + HistoryEntry.category, + HistoryEntry.score, + HistoryEntry.last_run_time, + func.row_number().over( + partition_by=HistoryEntry.definition_id, + order_by=HistoryEntry.last_run_time.desc(), + ) + .label("rn"), + ).subquery() + history_query = select(history_subquery).where(history_subquery.c.rn <= last_history_items) + + history_entries = db_session.execute(history_query).unique().all() + for entry in history_entries: + if (definition := definitions_map.get(str(entry.definition_id))): + definition.history.append(ScoreDefinitionResultHistoryEntry( + definition_id=entry.definition_id, + category=entry.category, + score=entry.score, + last_run_time=entry.last_run_time, + )) + return definitions def save(self) -> None: @@ -161,6 +195,23 @@ def delete(self) -> None: db_session.delete(self) db_session.commit() + def clear_results(self) -> None: + db_session = get_current_session() + + delete_results_query = delete(ScoreDefinitionResult).where( + ScoreDefinitionResult.definition_id == self.id + ) + delete_breakdown_query = delete(ScoreDefinitionBreakdownItem).where( + ScoreDefinitionBreakdownItem.definition_id == self.id + ) + + db_session.execute(delete_results_query) + db_session.execute(delete_breakdown_query) + db_session.flush() + + self.results = [] + self.breakdown = [] + def as_score_card(self) -> ScoreCard: """ Executes and combines two raw queries to build a fresh score @@ -223,7 +274,7 @@ def as_score_card(self) -> ScoreCard: "definition": self, } - def as_cached_score_card(self) -> ScoreCard: + def as_cached_score_card(self, include_definition: bool = False) -> ScoreCard: """Reads the cached values to build a scorecard""" root_keys: list[str] = ["score", "profiling_score", "testing_score", "cde_score"] score_card: ScoreCard = { @@ -232,7 +283,7 @@ def as_cached_score_card(self) -> ScoreCard: "name": self.name, "categories": [], "history": [], - "definition": self, + "definition": self if include_definition else None, } for result in sorted(self.results, key=lambda r: r.category): diff --git a/testgen/common/models/table_group.py b/testgen/common/models/table_group.py index 46e3da53..ae24af04 100644 --- a/testgen/common/models/table_group.py +++ b/testgen/common/models/table_group.py @@ -30,17 +30,33 @@ class TableGroupMinimal(EntityMinimal): profiling_delay_days: str +@dataclass +class TableGroupStats(EntityMinimal): + id: UUID + table_groups_name: str + table_group_schema: str + table_ct: int + column_ct: int + approx_record_ct: int + record_ct: int + approx_data_point_ct: int + data_point_ct: int + + @dataclass class TableGroupSummary(EntityMinimal): id: UUID table_groups_name: str + table_ct: int + column_ct: int + approx_record_ct: int + record_ct: int + approx_data_point_ct: int + data_point_ct: int dq_score_profiling: float dq_score_testing: float latest_profile_id: UUID latest_profile_start: datetime - latest_profile_table_ct: int - latest_profile_column_ct: int - latest_profile_data_point_ct: int latest_anomalies_ct: int latest_anomalies_definite_ct: int latest_anomalies_likely_ct: int @@ -113,18 +129,61 @@ def select_minimal_where( ) -> Iterable[TableGroupMinimal]: results = cls._select_columns_where(cls._minimal_columns, *clauses, order_by=order_by) return [TableGroupMinimal(**row) for 
row in results] + + @classmethod + @st.cache_data(show_spinner=False) + def select_stats(cls, project_code: str, table_group_id: str | UUID | None = None) -> Iterable[TableGroupStats]: + query = f""" + WITH stats AS ( + SELECT table_groups_id, + COUNT(*) AS table_ct, + SUM(column_ct) AS column_ct, + SUM(approx_record_ct) AS approx_record_ct, + SUM(record_ct) AS record_ct, + SUM(column_ct * approx_record_ct) AS approx_data_point_ct, + SUM(column_ct * record_ct) AS data_point_ct + FROM data_table_chars + GROUP BY table_groups_id + ) + SELECT groups.id, + groups.table_groups_name, + groups.table_group_schema, + stats.table_ct, + stats.column_ct, + stats.approx_record_ct, + stats.record_ct, + stats.approx_data_point_ct, + stats.data_point_ct + FROM table_groups AS groups + LEFT JOIN stats ON (groups.id = stats.table_groups_id) + WHERE groups.project_code = :project_code + {"AND groups.id = :table_group_id" if table_group_id else ""} + ORDER BY LOWER(groups.table_groups_name); + """ + params = {"project_code": project_code, "table_group_id": table_group_id} + db_session = get_current_session() + results = db_session.execute(text(query), params).mappings().all() + return [TableGroupStats(**row) for row in results] @classmethod @st.cache_data(show_spinner=False) def select_summary(cls, project_code: str, for_dashboard: bool = False) -> Iterable[TableGroupSummary]: query = f""" - WITH latest_profile AS ( + WITH stats AS ( + SELECT table_groups_id, + COUNT(*) AS table_ct, + SUM(column_ct) AS column_ct, + SUM(approx_record_ct) AS approx_record_ct, + SUM(record_ct) AS record_ct, + SUM(column_ct * approx_record_ct) AS approx_data_point_ct, + SUM(column_ct * record_ct) AS data_point_ct + FROM data_table_chars + GROUP BY table_groups_id + ), + latest_profile AS ( SELECT latest_run.table_groups_id, latest_run.id, latest_run.profiling_starttime, - latest_run.table_ct, - latest_run.column_ct, - latest_run.dq_total_data_points, latest_run.anomaly_ct, SUM( CASE @@ -167,19 +226,23 @@ def select_summary(cls, project_code: str, for_dashboard: bool = False) -> Itera ) SELECT groups.id, groups.table_groups_name, + stats.table_ct, + stats.column_ct, + stats.approx_record_ct, + stats.record_ct, + stats.approx_data_point_ct, + stats.data_point_ct, groups.dq_score_profiling, groups.dq_score_testing, latest_profile.id AS latest_profile_id, latest_profile.profiling_starttime AS latest_profile_start, - latest_profile.table_ct AS latest_profile_table_ct, - latest_profile.column_ct AS latest_profile_column_ct, - latest_profile.dq_total_data_points AS latest_profile_data_point_ct, latest_profile.anomaly_ct AS latest_anomalies_ct, latest_profile.definite_ct AS latest_anomalies_definite_ct, latest_profile.likely_ct AS latest_anomalies_likely_ct, latest_profile.possible_ct AS latest_anomalies_possible_ct, latest_profile.dismissed_ct AS latest_anomalies_dismissed_ct FROM table_groups AS groups + LEFT JOIN stats ON (groups.id = stats.table_groups_id) LEFT JOIN latest_profile ON (groups.id = latest_profile.table_groups_id) WHERE groups.project_code = :project_code {"AND groups.include_in_dashboard IS TRUE" if for_dashboard else ""}; @@ -309,7 +372,7 @@ def save( cron_expr="0 * * * *", cron_tz=monitor_schedule_timezone, args=[], - kwargs={"project_key": self.project_code, "test_suite_key": test_suite.test_suite}, + kwargs={"test_suite_id": test_suite.id}, ) db_session.add(schedule_job) diff --git a/testgen/common/models/test_definition.py b/testgen/common/models/test_definition.py index b193dff6..195121b7 100644 --- 
a/testgen/common/models/test_definition.py +++ b/testgen/common/models/test_definition.py @@ -6,10 +6,8 @@ import streamlit as st from sqlalchemy import ( - BigInteger, Column, ForeignKey, - Identity, String, Text, TypeDecorator, @@ -29,6 +27,8 @@ from testgen.common.models.entity import ENTITY_HASH_FUNCS, Entity, EntityMinimal from testgen.utils import is_uuid4 +TestRunType = Literal["QUERY", "CAT", "METADATA"] +TestScope = Literal["column", "referential", "table", "tablegroup", "custom"] TestRunStatus = Literal["Running", "Complete", "Error", "Cancelled"] @@ -84,7 +84,7 @@ class TestDefinitionSummary(EntityMinimal): default_parm_prompts: str default_parm_help: str default_severity: str - test_scope: str + test_scope: TestScope usage_notes: str @@ -135,8 +135,8 @@ class TestType(Entity): default_parm_prompts: str = Column(Text) default_parm_help: str = Column(Text) default_severity: str = Column(String) - run_type: str = Column(String) - test_scope: str = Column(String) + run_type: TestRunType = Column(String) + test_scope: TestScope = Column(String) dq_dimension: str = Column(String) health_dimension: str = Column(String) threshold_description: str = Column(String) @@ -147,14 +147,12 @@ class TestType(Entity): class TestDefinition(Entity): __tablename__ = "test_definitions" - id: UUID = Column(postgresql.UUID(as_uuid=True), server_default=text("gen_random_uuid()")) - cat_test_id: int = Column(BigInteger, Identity(), primary_key=True) + id: UUID = Column(postgresql.UUID(as_uuid=True), server_default=text("gen_random_uuid()"), primary_key=True) table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True)) profile_run_id: UUID = Column(postgresql.UUID(as_uuid=True)) test_type: str = Column(String) test_suite_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("test_suites.id"), nullable=False) test_description: str = Column(NullIfEmptyString) - test_action: str = Column(String) schema_name: str = Column(String) table_name: str = Column(NullIfEmptyString) column_name: str = Column(NullIfEmptyString) @@ -203,12 +201,10 @@ class TestDefinition(Entity): _minimal_columns = TestDefinitionMinimal.__annotations__.keys() _update_exclude_columns = ( id, - cat_test_id, table_groups_id, profile_run_id, test_type, test_suite_id, - test_action, schema_name, test_mode, watch_level, @@ -272,6 +268,7 @@ def set_status_attribute( ) UPDATE test_definitions SET {status_type} = :value + {", test_definition_status = NULL" if status_type == "test_active" and value else ""} FROM test_definitions td INNER JOIN selected ON (td.id = selected.id::UUID) WHERE td.id = test_definitions.id; @@ -331,7 +328,6 @@ def copy( target_table_name: str | None = None, target_column_name: str | None = None, ) -> None: - id_columns = (cls.id, cls.cat_test_id) modified_columns = [cls.table_groups_id, cls.profile_run_id, cls.test_suite_id] select_columns = [ @@ -352,7 +348,7 @@ def copy( select_columns.append(literal(target_column_name).label("column_name")) other_columns = [ - column for column in cls.__table__.columns if column not in modified_columns and column not in id_columns + column for column in cls.__table__.columns if column not in modified_columns and column != cls.id ] select_columns.extend(other_columns) diff --git a/testgen/common/models/test_run.py b/testgen/common/models/test_run.py index ed1a01fe..4451fcfc 100644 --- a/testgen/common/models/test_run.py +++ b/testgen/common/models/test_run.py @@ -1,12 +1,13 @@ from collections.abc import Iterable from dataclasses import dataclass from datetime import UTC, 
datetime -from typing import Literal, NamedTuple -from uuid import UUID +from typing import Literal, NamedTuple, TypedDict +from uuid import UUID, uuid4 import streamlit as st from sqlalchemy import BigInteger, Column, Float, ForeignKey, Integer, String, Text, desc, func, select, text, update from sqlalchemy.dialects import postgresql +from sqlalchemy.orm.attributes import flag_modified from sqlalchemy.sql.expression import case from testgen.common.models import get_current_session @@ -15,6 +16,15 @@ from testgen.utils import is_uuid4 TestRunStatus = Literal["Running", "Complete", "Error", "Cancelled"] +ProgressKey = Literal["data_chars", "validation", "QUERY", "CAT", "METADATA"] +ProgressStatus = Literal["Pending", "Running", "Completed", "Warning"] + +class ProgressStep(TypedDict): + key: ProgressKey + status: ProgressStatus + label: str + detail: str + error: str @dataclass @@ -37,6 +47,7 @@ class TestRunSummary(EntityMinimal): table_groups_name: str test_suite: str status: TestRunStatus + progress: list[ProgressStep] process_id: int log_message: str test_ct: int @@ -57,13 +68,13 @@ class LatestTestRun(NamedTuple): class TestRun(Entity): __tablename__ = "test_runs" - id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True, nullable=False) + id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True, nullable=False, default=uuid4) test_suite_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("test_suites.id"), nullable=False) test_starttime: datetime = Column(postgresql.TIMESTAMP) test_endtime: datetime = Column(postgresql.TIMESTAMP) status: TestRunStatus = Column(String, default="Running") + progress: list[ProgressStep] = Column(postgresql.JSONB, default=[]) log_message: str = Column(Text) - duration: str = Column(String) test_ct: int = Column(Integer) passed_ct: int = Column(Integer) failed_ct: int = Column(Integer) @@ -187,6 +198,7 @@ def select_summary( table_groups.table_groups_name, test_suites.test_suite, test_runs.status, + test_runs.progress, test_runs.process_id, test_runs.log_message, test_runs.test_ct, @@ -233,8 +245,8 @@ def cancel_all_running(cls) -> None: cls.clear_cache() @classmethod - def update_status(cls, run_id: str | UUID, status: TestRunStatus) -> None: - query = update(cls).where(cls.id == run_id).values(status=status) + def cancel_run(cls, run_id: str | UUID) -> None: + query = update(cls).where(cls.id == run_id).values(status="Cancelled", test_endtime=datetime.now(UTC)) db_session = get_current_session() db_session.execute(query) db_session.commit() @@ -243,12 +255,6 @@ def update_status(cls, run_id: str | UUID, status: TestRunStatus) -> None: @classmethod def cascade_delete(cls, ids: list[str]) -> None: query = """ - DELETE FROM working_agg_cat_results - WHERE test_run_id IN :test_run_ids; - - DELETE FROM working_agg_cat_tests - WHERE test_run_id IN :test_run_ids; - DELETE FROM test_results WHERE test_run_id IN :test_run_ids; """ @@ -263,5 +269,24 @@ def clear_cache(cls) -> bool: cls.get_minimal.clear() cls.select_summary.clear() - def save(self) -> None: - raise NotImplementedError + def init_progress(self) -> None: + self._progress = { + "data_chars": {"label": "Refreshing data catalog"}, + "validation": {"label": "Validating test definitions"}, + "QUERY": {"label": "Running query tests"}, + "CAT": {"label": "Running aggregated tests"}, + # TODO: TURN ON WHEN ADDING METADATA TESTS + # "METADATA": {"label": "Running metadata tests"}, + } + for key in self._progress: + self._progress[key].update({"key": key, "status": "Pending"}) + + 
def set_progress(self, key: ProgressKey, status: ProgressStatus, detail: str | None = None, error: str | None = None) -> None: + self._progress[key]["status"] = status + if detail: + self._progress[key]["detail"] = detail + if error: + self._progress[key]["error"] = error + + self.progress = list(self._progress.values()) + flag_modified(self, "progress") diff --git a/testgen/common/models/test_suite.py b/testgen/common/models/test_suite.py index 368147a2..ce7d601c 100644 --- a/testgen/common/models/test_suite.py +++ b/testgen/common/models/test_suite.py @@ -56,10 +56,8 @@ class TestSuite(Entity): connection_id: int = Column(BigInteger, ForeignKey("connections.connection_id")) table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True)) test_suite_description: str = Column(NullIfEmptyString) - test_action: str = Column(String) severity: str = Column(NullIfEmptyString) export_to_observability: bool = Column(YNString, default="Y") - test_suite_schema: str = Column(NullIfEmptyString) component_key: str = Column(NullIfEmptyString) component_type: str = Column(NullIfEmptyString) component_name: str = Column(NullIfEmptyString) @@ -216,18 +214,6 @@ def is_in_use(cls, ids: list[str]) -> bool: @classmethod def cascade_delete(cls, ids: list[str]) -> None: query = """ - DELETE FROM working_agg_cat_results - WHERE test_run_id IN ( - SELECT id FROM test_runs - WHERE test_suite_id IN :test_suite_ids - ); - - DELETE FROM working_agg_cat_tests - WHERE test_run_id IN ( - SELECT id FROM test_runs - WHERE test_suite_id IN :test_suite_ids - ); - DELETE FROM test_runs WHERE test_suite_id IN :test_suite_ids; @@ -237,11 +223,8 @@ def cascade_delete(cls, ids: list[str]) -> None: DELETE FROM test_definitions WHERE test_suite_id IN :test_suite_ids; - DELETE FROM job_schedules js - USING test_suites ts - WHERE js.kwargs->>'project_key' = ts.project_code - AND js.kwargs->>'test_suite_key' = ts.test_suite - AND ts.id IN :test_suite_ids; + DELETE FROM job_schedules + WHERE (kwargs->>'test_suite_id')::UUID IN :test_suite_ids; """ db_session = get_current_session() db_session.execute(text(query), {"test_suite_ids": tuple(ids)}) diff --git a/testgen/common/read_yaml_metadata_records.py b/testgen/common/read_yaml_metadata_records.py index 6361b2b7..28f8cf59 100644 --- a/testgen/common/read_yaml_metadata_records.py +++ b/testgen/common/read_yaml_metadata_records.py @@ -164,7 +164,6 @@ def _process_yaml_for_import(params_mapping: dict, data:dict, parent_table:str, user_override=params_mapping["TESTGEN_ADMIN_USER"], password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], user_type="schema_admin", - suppress_logs=True, ) return diff --git a/testgen/settings.py b/testgen/settings.py index 07f044fa..cf71768d 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -374,7 +374,7 @@ OBSERVABILITY_VERIFY_SSL: bool = os.getenv("TG_EXPORT_TO_OBSERVABILITY_VERIFY_SSL", "yes").lower() in ["yes", "true"] """ -When False, exporting events to your instance of Observabilty will skip +When False, exporting events to your instance of Observability will skip SSL verification. from env variable: `TG_EXPORT_TO_OBSERVABILITY_VERIFY_SSL` @@ -383,7 +383,7 @@ OBSERVABILITY_EXPORT_LIMIT: int = int(os.getenv("TG_OBSERVABILITY_EXPORT_MAX_QTY", "5000")) """ -When exporting to your instance of Observabilty, the maximum number of +When exporting to your instance of Observability, the maximum number of events that will be sent to the events API on a single export. 
from env variable: `TG_OBSERVABILITY_EXPORT_MAX_QTY` @@ -392,7 +392,7 @@ OBSERVABILITY_DEFAULT_COMPONENT_TYPE: str = os.getenv("OBSERVABILITY_DEFAULT_COMPONENT_TYPE", "dataset") """ -When exporting to your instance of Observabilty, the type of event that +When exporting to your instance of Observability, the type of event that will be sent to the events API. from env variable: `OBSERVABILITY_DEFAULT_COMPONENT_TYPE` @@ -401,7 +401,7 @@ OBSERVABILITY_DEFAULT_COMPONENT_KEY: str = os.getenv("OBSERVABILITY_DEFAULT_COMPONENT_KEY", "default") """ -When exporting to your instance of Observabilty, the key sent to the +When exporting to your instance of Observability, the key sent to the events API to identify the components. from env variable: `OBSERVABILITY_DEFAULT_COMPONENT_KEY` @@ -475,3 +475,28 @@ """ Limit the number of records used to generate the PDF with test results and hygiene issue reports. """ + +EMAIL_FROM_ADDRESS: str | None = os.getenv("TG_EMAIL_FROM_ADDRESS") +""" +Email: Sender address +""" + +SMTP_ENDPOINT: str | None = os.getenv("TG_SMTP_ENDPOINT") +""" +Email: SMTP endpoint +""" + +SMTP_PORT: int | None = int(os.getenv("TG_SMTP_PORT", 0)) or None +""" +Email: SMTP port +""" + +SMTP_USERNAME: str | None = os.getenv("TG_SMTP_USERNAME") +""" +Email: SMTP username +""" + +SMTP_PASSWORD: str | None = os.getenv("TG_SMTP_PASSWORD") +""" +Email: SMTP password +""" diff --git a/testgen/template/profiling/contingency_columns.sql b/testgen/template/contingency/contingency_columns.sql similarity index 100% rename from testgen/template/profiling/contingency_columns.sql rename to testgen/template/contingency/contingency_columns.sql diff --git a/testgen/template/flavors/generic/profiling/contingency_counts.sql b/testgen/template/contingency/contingency_counts.sql similarity index 100% rename from testgen/template/flavors/generic/profiling/contingency_counts.sql rename to testgen/template/contingency/contingency_counts.sql diff --git a/testgen/template/data_chars/data_chars_staging_delete.sql b/testgen/template/data_chars/data_chars_staging_delete.sql index 292d722c..82418a98 100644 --- a/testgen/template/data_chars/data_chars_staging_delete.sql +++ b/testgen/template/data_chars/data_chars_staging_delete.sql @@ -1,4 +1,3 @@ DELETE FROM stg_data_chars_updates -WHERE project_code = :PROJECT_CODE - AND table_groups_id = :TABLE_GROUPS_ID +WHERE table_groups_id = :TABLE_GROUPS_ID AND run_date = :RUN_DATE; diff --git a/testgen/template/data_chars/data_chars_update.sql b/testgen/template/data_chars/data_chars_update.sql index ec16d4e0..448d07cf 100644 --- a/testgen/template/data_chars/data_chars_update.sql +++ b/testgen/template/data_chars/data_chars_update.sql @@ -7,20 +7,19 @@ WITH new_chars AS ( SELECT table_groups_id, schema_name, table_name, - functional_table_type, run_date, + MAX(approx_record_ct) AS approx_record_ct, MAX(record_ct) AS record_ct, COUNT(*) AS column_ct - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id, schema_name, table_name, - functional_table_type, run_date ) UPDATE data_table_chars -SET functional_table_type = COALESCE(n.functional_table_type, d.functional_table_type), +SET approx_record_ct = n.approx_record_ct, record_ct = n.record_ct, column_ct = n.column_ct, last_refresh_date = n.run_date, @@ -38,34 +37,33 @@ WITH new_chars AS ( SELECT table_groups_id, schema_name, table_name, - functional_table_type, run_date, + MAX(approx_record_ct) AS approx_record_ct, MAX(record_ct) AS record_ct, COUNT(*) AS column_ct 
- FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id, schema_name, table_name, - functional_table_type, run_date ) INSERT INTO data_table_chars ( table_groups_id, schema_name, table_name, - functional_table_type, add_date, last_refresh_date, + approx_record_ct, record_ct, column_ct ) SELECT n.table_groups_id, n.schema_name, n.table_name, - n.functional_table_type, n.run_date, n.run_date, + n.approx_record_ct, n.record_ct, n.column_ct FROM new_chars n @@ -81,7 +79,7 @@ WITH new_chars AS ( SELECT table_groups_id, schema_name, table_name - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id, schema_name, @@ -90,7 +88,7 @@ WITH new_chars AS ( last_run AS ( SELECT table_groups_id, MAX(run_date) as last_run_date - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id ) @@ -118,21 +116,17 @@ WITH new_chars AS ( table_name, column_name, position, - general_type, column_type, db_data_type, - functional_data_type, run_date - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID ), update_chars AS ( UPDATE data_column_chars SET ordinal_position = n.position, - general_type = n.general_type, column_type = n.column_type, db_data_type = n.db_data_type, - functional_data_type = COALESCE(n.functional_data_type, d.functional_data_type), last_mod_date = CASE WHEN n.db_data_type <> d.db_data_type THEN n.run_date ELSE d.last_mod_date END, drop_date = NULL FROM new_chars n @@ -172,9 +166,8 @@ WITH new_chars AS ( general_type, column_type, db_data_type, - functional_data_type, run_date - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID ), inserted_records AS ( @@ -188,7 +181,6 @@ inserted_records AS ( general_type, column_type, db_data_type, - functional_data_type, add_date, last_mod_date ) @@ -201,7 +193,6 @@ inserted_records AS ( n.general_type, n.column_type, n.db_data_type, - n.functional_data_type, n.run_date, n.run_date FROM new_chars n @@ -237,13 +228,13 @@ WITH new_chars AS ( schema_name, table_name, column_name - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID ), last_run AS ( SELECT table_groups_id, MAX(run_date) as last_run_date - FROM {SOURCE_TABLE} + FROM stg_data_chars_updates WHERE table_groups_id = :TABLE_GROUPS_ID GROUP BY table_groups_id ), diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index ad76f02c..2df5365e 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -6,10 +6,6 @@ SET SEARCH_PATH TO {SCHEMA_NAME}; -- | This script should only be run for new schema -- no drops -- ============================================================================== -CREATE SEQUENCE test_definitions_cat_test_id_seq; - -CREATE SEQUENCE profile_results_dk_id_seq; - CREATE TABLE stg_secondary_profile_updates ( project_code VARCHAR(30), schema_name VARCHAR(50), @@ -30,18 +26,16 @@ CREATE TABLE stg_functional_table_updates ( ); CREATE TABLE stg_data_chars_updates ( - project_code VARCHAR(30), table_groups_id UUID, run_date TIMESTAMP, schema_name VARCHAR(120), table_name VARCHAR(120), - functional_table_type VARCHAR(50), column_name VARCHAR(120), position INTEGER, general_type VARCHAR(1), column_type VARCHAR(50), 
db_data_type VARCHAR(50), - functional_data_type VARCHAR(50), + approx_record_ct BIGINT, record_ct BIGINT ); @@ -76,6 +70,7 @@ CREATE TABLE connections ( url VARCHAR(200) default '', connect_by_url BOOLEAN default FALSE, connect_by_key BOOLEAN DEFAULT FALSE, + connect_with_identity BOOLEAN DEFAULT FALSE, private_key BYTEA, private_key_passphrase BYTEA, http_path VARCHAR(200), @@ -134,9 +129,12 @@ CREATE TABLE profiling_runs ( profiling_starttime TIMESTAMP, profiling_endtime TIMESTAMP, status VARCHAR(100) DEFAULT 'Running', + progress JSONB, log_message VARCHAR, table_ct BIGINT, column_ct BIGINT, + record_ct BIGINT, + data_point_ct BIGINT, anomaly_ct BIGINT, anomaly_table_ct BIGINT, anomaly_column_ct BIGINT, @@ -155,10 +153,8 @@ CREATE TABLE test_suites ( REFERENCES connections, table_groups_id UUID, test_suite_description VARCHAR(1000), - test_action VARCHAR(100), severity VARCHAR(10), export_to_observability VARCHAR(5) DEFAULT 'Y', - test_suite_schema VARCHAR(100), component_key VARCHAR(100), component_type VARCHAR(100), component_name VARCHAR(100), @@ -173,16 +169,14 @@ ALTER TABLE table_groups ADD CONSTRAINT table_groups_test_suites_monitor_test_su FOREIGN KEY (monitor_test_suite_id) REFERENCES test_suites ON DELETE SET NULL; CREATE TABLE test_definitions ( - id UUID DEFAULT gen_random_uuid(), - cat_test_id BIGINT GENERATED BY DEFAULT AS IDENTITY - CONSTRAINT test_definitions_cat_test_id_pk + id UUID DEFAULT gen_random_uuid() + CONSTRAINT test_definitions_id_pk PRIMARY KEY, table_groups_id UUID, profile_run_id UUID, test_type VARCHAR(200), test_suite_id UUID NOT NULL, test_description VARCHAR(1000), - test_action VARCHAR(100), schema_name VARCHAR(100), table_name VARCHAR(100), column_name VARCHAR(500), @@ -226,16 +220,10 @@ CREATE TABLE test_definitions ( FOREIGN KEY (test_suite_id) REFERENCES test_suites ); -ALTER SEQUENCE test_definitions_cat_test_id_seq OWNED BY test_definitions.cat_test_id; - CREATE TABLE profile_results ( id UUID DEFAULT gen_random_uuid() CONSTRAINT profile_results_id_pk PRIMARY KEY, - dk_id BIGINT GENERATED ALWAYS AS IDENTITY, --- CONSTRAINT profile_results_dk_id_pk --- PRIMARY KEY, - column_id UUID, project_code VARCHAR(30), connection_id BIGINT CONSTRAINT profile_results_connections_connection_id_fk @@ -307,12 +295,10 @@ CREATE TABLE profile_results ( pii_flag VARCHAR(50), functional_data_type VARCHAR(50), functional_table_type VARCHAR(50), - sample_ratio FLOAT + sample_ratio FLOAT, + query_error VARCHAR(2000) ); -ALTER SEQUENCE profile_results_dk_id_seq OWNED BY profile_results.dk_id; - - CREATE TABLE profile_anomaly_types ( id VARCHAR(10) NOT NULL CONSTRAINT pk_anomaly_types_id @@ -400,9 +386,9 @@ CREATE TABLE data_table_chars ( add_date TIMESTAMP, drop_date TIMESTAMP, last_refresh_date TIMESTAMP, + approx_record_ct BIGINT, record_ct BIGINT, column_ct BIGINT, - data_point_ct BIGINT GENERATED ALWAYS AS (record_ct * column_ct) STORED, last_complete_profile_run_id UUID, last_profile_record_ct BIGINT, dq_score_profiling FLOAT, @@ -512,8 +498,8 @@ CREATE TABLE test_runs ( test_starttime TIMESTAMP, test_endtime TIMESTAMP, status VARCHAR(100) DEFAULT 'Running', + progress JSONB, log_message TEXT, - duration VARCHAR(50), test_ct INTEGER, passed_ct INTEGER, failed_ct INTEGER, @@ -542,8 +528,6 @@ CREATE TABLE test_results ( test_definition_id UUID, auto_gen BOOLEAN, test_time TIMESTAMP, - starttime TIMESTAMP, - endtime TIMESTAMP, schema_name VARCHAR(100), table_name VARCHAR(100), column_names VARCHAR(500), @@ -552,15 +536,11 @@ CREATE TABLE test_results ( result_code 
INTEGER, severity VARCHAR(10), result_status VARCHAR(10), - result_message VARCHAR(1000), + result_message VARCHAR, result_signal VARCHAR(1000), result_measure VARCHAR(1000), threshold_value VARCHAR(1000), - result_error_data VARCHAR(4000), - test_action VARCHAR(100), disposition VARCHAR(20), - subset_condition VARCHAR(500), - result_query VARCHAR(4000), test_description VARCHAR(1000), test_run_id UUID NOT NULL, table_groups_id UUID, @@ -571,38 +551,6 @@ CREATE TABLE test_results ( FOREIGN KEY (test_suite_id) REFERENCES test_suites ); -CREATE TABLE working_agg_cat_tests ( - test_run_id UUID NOT NULL, - schema_name VARCHAR(200) NOT NULL, - table_name VARCHAR(200) NOT NULL, - cat_sequence INTEGER NOT NULL, - test_count INTEGER, - test_time TIMESTAMP, - start_time TIMESTAMP, - end_time TIMESTAMP, - column_names TEXT, - test_types TEXT, - test_definition_ids TEXT, - test_actions TEXT, - test_descriptions TEXT, - test_parms TEXT, - test_measures TEXT, - test_conditions TEXT, - CONSTRAINT working_agg_cat_tests_trid_sn_tn_cs - PRIMARY KEY (test_run_id, schema_name, table_name, cat_sequence) -); - -CREATE TABLE working_agg_cat_results ( - test_run_id UUID NOT NULL, - schema_name VARCHAR(200) NOT NULL, - table_name VARCHAR(200) NOT NULL, - cat_sequence INTEGER NOT NULL, - measure_results TEXT, - test_results TEXT, - CONSTRAINT working_agg_cat_results_tri_sn_tn_cs - PRIMARY KEY (test_run_id, schema_name, table_name, cat_sequence) -); - CREATE TABLE cat_test_conditions ( id VARCHAR, test_type VARCHAR(200) NOT NULL @@ -729,10 +677,6 @@ CREATE TABLE IF NOT EXISTS score_definition_results_breakdown ( CREATE UNIQUE INDEX table_groups_name_unique ON table_groups(project_code, table_groups_name); --- Index working table - ORIGINAL -CREATE INDEX working_agg_cat_tests_test_run_id_index - ON working_agg_cat_tests(test_run_id); - -- Index Connections CREATE UNIQUE INDEX uix_con_id ON connections(id); diff --git a/testgen/template/dbsetup/060_create_standard_views.sql b/testgen/template/dbsetup/060_create_standard_views.sql index 0eea3855..536edcee 100644 --- a/testgen/template/dbsetup/060_create_standard_views.sql +++ b/testgen/template/dbsetup/060_create_standard_views.sql @@ -22,30 +22,6 @@ INNER JOIN profile_results r ON p.id = r.profile_run_id; -DROP VIEW IF EXISTS v_latest_profile_anomalies; - -CREATE VIEW v_latest_profile_anomalies - AS -WITH last_profile_date - AS (SELECT table_groups_id, MAX(profiling_starttime) as last_profile_run_date - FROM profiling_runs - GROUP BY table_groups_id) -SELECT r.id, r.project_code, r.table_groups_id, - r.profile_run_id, pr.profiling_starttime as profile_run_date, - r.schema_name, r.table_name, r.column_name, r.column_type, - t.anomaly_name, t.anomaly_description, t.issue_likelihood, - r.detail, - t.suggested_action, r.disposition - FROM profile_anomaly_results r -INNER JOIN profile_anomaly_types t - ON r.anomaly_id = t.id -INNER JOIN profiling_runs pr - ON (r.profile_run_id = pr.id) -INNER JOIN last_profile_date l - ON (pr.table_groups_id = l.table_groups_id - AND pr.profiling_starttime = l.last_profile_run_date); - - DROP VIEW IF EXISTS v_inactive_anomalies; CREATE VIEW v_inactive_anomalies @@ -55,59 +31,6 @@ SELECT DISTINCT anomaly_id, table_groups_id, schema_name, table_name, column_nam WHERE disposition = 'Inactive'; -DROP VIEW IF EXISTS v_profiling_runs; - -CREATE VIEW v_profiling_runs - AS -SELECT r.id as profiling_run_id, - r.project_code, cc.connection_name, r.connection_id, r.table_groups_id, - tg.table_groups_name, - tg.table_group_schema as schema_name, - 
r.profiling_starttime as start_time, - r.profiling_endtime as end_time, - r.status, - r.log_message, - r.table_ct, - r.column_ct, - r.anomaly_ct, r.anomaly_table_ct, r.anomaly_column_ct, - process_id, r.dq_score_profiling - FROM profiling_runs r -INNER JOIN table_groups tg - ON r.table_groups_id = tg.id -INNER JOIN connections cc - ON r.connection_id = cc.connection_id -GROUP BY r.id, r.project_code, cc.connection_name, r.connection_id, - r.table_groups_id, tg.table_groups_name, tg.table_group_schema, - r.profiling_starttime, r.profiling_endtime, r.status; - - -DROP VIEW IF EXISTS v_test_runs; - -CREATE VIEW v_test_runs - AS -SELECT r.id as test_run_id, - p.project_code, - p.project_name, - ts.test_suite, - r.test_starttime, - TO_CHAR(r.test_endtime - r.test_starttime, 'HH24:MI:SS') as duration, - r.status, r.log_message, - COUNT(*) as test_ct, - SUM(result_code) as passed_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Failed' THEN 1 END), 0) as failed_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Warning' THEN 1 END), 0) as warning_ct, - r.process_id - FROM test_runs r -INNER JOIN test_suites ts - ON (r.test_suite_id = ts.id) -INNER JOIN projects p - ON (ts.project_code = p.project_code) -INNER JOIN test_results tr - ON (r.id = tr.test_run_id) -GROUP BY r.id, p.project_code, ts.test_suite, r.test_starttime, r.test_endtime, - r.process_id, r.status, r.log_message, p.project_name; - - DROP VIEW IF EXISTS v_test_results; CREATE VIEW v_test_results @@ -138,15 +61,13 @@ SELECT p.project_name, r.result_code as passed_ct, (1 - COALESCE(r.result_code, 0))::INTEGER as exception_ct, CASE - WHEN result_status = 'Warning' - AND result_message NOT ILIKE 'Inactivated%' THEN 1 + WHEN result_status = 'Warning' THEN 1 END::INTEGER as warning_ct, CASE - WHEN result_status = 'Failed' - AND result_message NOT ILIKE 'Inactivated%' THEN 1 + WHEN result_status = 'Failed' THEN 1 END::INTEGER as failed_ct, CASE - WHEN result_message ILIKE 'Inactivated%' THEN 1 + WHEN result_status = 'Error' THEN 1 END as execution_error_ct, p.project_code, r.table_groups_id, @@ -189,7 +110,6 @@ CREATE VIEW v_queued_observability_results SELECT p.project_name, cn.sql_flavor as component_tool, - ts.test_suite_schema as schema, cn.connection_name, cn.project_db, diff --git a/testgen/template/dbsetup/075_grant_role_rights.sql b/testgen/template/dbsetup/075_grant_role_rights.sql index f8fb631e..97a54b48 100644 --- a/testgen/template/dbsetup/075_grant_role_rights.sql +++ b/testgen/template/dbsetup/075_grant_role_rights.sql @@ -23,8 +23,6 @@ GRANT SELECT, INSERT, DELETE, UPDATE ON {SCHEMA_NAME}.stg_secondary_profile_updates, {SCHEMA_NAME}.stg_data_chars_updates, {SCHEMA_NAME}.test_runs, - {SCHEMA_NAME}.working_agg_cat_results, - {SCHEMA_NAME}.working_agg_cat_tests, {SCHEMA_NAME}.functional_test_results, {SCHEMA_NAME}.connections, {SCHEMA_NAME}.table_groups, diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml index fae0ec4b..fc3bd2e8 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml @@ -32,7 +32,8 @@ profile_anomaly_types: SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` - ORDER BY COUNT(*) DESC; + ORDER BY COUNT(*) DESC + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1287' test_id: 
'1015' @@ -40,7 +41,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1129' test_id: '1015' @@ -48,7 +49,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Profile Anomaly - id: '1072' test_id: '1015' @@ -56,7 +57,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1047' test_id: '1015' @@ -64,7 +65,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1447' test_id: '1015' @@ -72,7 +73,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1186' test_id: '1015' @@ -80,5 +81,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml index a7371d24..d7690240 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml @@ -34,7 +34,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS DATE) IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) UNION ALL ( @@ -43,7 +43,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS DATE) IS NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) ORDER BY data_type, count DESC; error_type: Profile Anomaly @@ -53,7 +53,7 @@ 
profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10; + SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1126' test_id: '1012' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1069' test_id: '1012' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1044' test_id: '1012' @@ -77,7 +77,7 @@ profile_anomaly_types: sql_flavor: redshift 
lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1444' test_id: '1012' @@ -85,7 +85,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1183' test_id: '1012' @@ -93,5 +93,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT TOP 10 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml index 12cccad4..9c600bac 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml @@ -34,7 +34,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) UNION ALL ( @@ -43,7 +43,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) ORDER BY data_type, count DESC; error_type: Profile Anomaly @@ -53,7 +53,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10; + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1125' test_id: '1011' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1068' test_id: '1011' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count 
FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1043' test_id: '1011' @@ -77,7 +77,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1443' test_id: '1011' @@ -85,7 +85,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1182' test_id: '1011' @@ -93,5 +93,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml index 987d9f06..7bdd0df6 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml @@ -42,7 +42,7 @@ profile_anomaly_types: WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 5 + LIMIT {LIMIT_4} ) UNION ALL ( @@ -52,7 +52,7 @@ profile_anomaly_types: WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 5 + LIMIT {LIMIT_4} ) UNION ALL ( @@ -62,7 +62,7 @@ profile_anomaly_types: WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 5 + LIMIT {LIMIT_4} ) UNION ALL ( @@ -72,7 +72,7 @@ profile_anomaly_types: WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(CAST(`{COLUMN_NAME}` AS STRING), r'[a-z]', 'a'), r'[A-Z]', 'A'), r'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 5 + LIMIT {LIMIT_4} ) ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly @@ -82,7 +82,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), 
'[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( `{COLUMN_NAME}`::STRING, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1121' test_id: '1007' @@ -90,7 +90,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX('| '+ TRIM(value) + ' |', '| ' + '{DETAIL_EXPRESSION}' + ' |' ) ASC) as row_num FROM STRING_SPLIT('{DETAIL_EXPRESSION}', '|') ) SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP 5 c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC; + WITH cte AS ( SELECT TRIM(value) AS top_pattern, ROW_NUMBER() OVER (ORDER BY CHARINDEX('| '+ TRIM(value) + ' |', '| ' + '{DETAIL_EXPRESSION}' + ' |' ) ASC) as row_num FROM STRING_SPLIT('{DETAIL_EXPRESSION}', '|') ) SELECT DISTINCT TOP {LIMIT_4} c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 4 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP {LIMIT_4} c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 6 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP {LIMIT_4} c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 8 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" UNION ALL SELECT DISTINCT TOP {LIMIT_4} c.top_pattern, a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte c WHERE c.row_num = 10 AND TRANSLATE(a."{COLUMN_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN') = c.top_pattern GROUP BY c.top_pattern, a."{COLUMN_NAME}" ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1064' test_id: '1007' @@ -98,7 +98,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = 
b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5 ) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT 5) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1039' test_id: '1007' @@ -106,7 +106,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( 
"{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1439' test_id: '1007' @@ -114,7 +114,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) A UNION ALL SELECT B.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 
'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) B UNION ALL SELECT C.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) C UNION ALL SELECT D.* FROM ( SELECT TOP 5 DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC ) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM ( SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}", '[a-z]', 'a'),'[A-Z]', 'A'),'[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly - id: '1178' test_id: '1007' @@ -122,5 +122,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern 
GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) B UNION ALL SELECT C.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) C UNION ALL SELECT D.* FROM (SELECT DISTINCT TOP 5 b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC) D ORDER BY top_pattern DESC, count DESC; + SELECT A.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml index aea55e63..f2a2adec 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml @@ -28,7 +28,7 @@ profile_anomaly_types: AND NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'.*\s(and|but|or|yet)\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY COUNT(*) DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1297' test_id: '1025' @@ -36,7 +36,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM 
`{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '.*\\s(and|but|or|yet)\\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '.*\\s(and|but|or|yet)\\s.*') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1139' test_id: '1025' @@ -44,7 +44,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE ( "{COLUMN_NAME}" LIKE '%,%,%,%' OR "{COLUMN_NAME}" LIKE '%|%|%|%' OR "{COLUMN_NAME}" LIKE '%^%^%^%' OR "{COLUMN_NAME}" LIKE '%' + CHAR(9) + '%' + CHAR(9) + '%' + CHAR(9) + '%' ) AND NOT ( "{COLUMN_NAME}" LIKE '% and %' OR "{COLUMN_NAME}" LIKE '% but %' OR "{COLUMN_NAME}" LIKE '% or %' OR "{COLUMN_NAME}" LIKE '% yet %' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ',', '')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ' ', '')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE ( "{COLUMN_NAME}" LIKE '%,%,%,%' OR "{COLUMN_NAME}" LIKE '%|%|%|%' OR "{COLUMN_NAME}" LIKE '%^%^%^%' OR "{COLUMN_NAME}" LIKE '%' + CHAR(9) + '%' + CHAR(9) + '%' + CHAR(9) + '%' ) AND NOT ( "{COLUMN_NAME}" LIKE '% and %' OR "{COLUMN_NAME}" LIKE '% but %' OR "{COLUMN_NAME}" LIKE '% or %' OR "{COLUMN_NAME}" LIKE '% yet %' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ',', '')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", ' ', '')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Profile Anomaly - id: '1082' test_id: '1025' @@ -52,7 +52,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\s(and|but|or|yet)\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\s(and|but|or|yet)\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1057' test_id: '1025' @@ -60,7 +60,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC 
LIMIT {LIMIT};
     error_type: Profile Anomaly
   - id: '1457'
     test_id: '1025'
@@ -68,7 +68,7 @@ profile_anomaly_types:
     sql_flavor: redshift_spectrum
     lookup_type: null
     lookup_query: |-
-      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COLUMN_NAME}" !~ '\\s(and|but|or|yet)\\s' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT};
     error_type: Profile Anomaly
   - id: '1196'
     test_id: '1025'
@@ -76,5 +76,5 @@ profile_anomaly_types:
     sql_flavor: snowflake
     lookup_type: null
     lookup_query: |-
-      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;
+      SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT};
     error_type: Profile Anomaly
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
index b09f8700..6443d845 100644
--- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
+++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml
@@ -28,7 +28,7 @@ profile_anomaly_types:
       FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
       WHERE UPPER(CAST(`{COLUMN_NAME}` AS STRING)) = CAST(`{COLUMN_NAME}` AS STRING)
       GROUP BY `{COLUMN_NAME}`
-      LIMIT 20
+      LIMIT {LIMIT_2}
       )
       UNION ALL
       (
@@ -37,7 +37,7 @@ profile_anomaly_types:
       (SELECT 'Mixed Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
       WHERE CAST(`{COLUMN_NAME}` AS STRING) <> UPPER(CAST(`{COLUMN_NAME}` AS STRING)) AND CAST(`{COLUMN_NAME}` AS STRING) <> LOWER(CAST(`{COLUMN_NAME}` AS STRING))
       GROUP BY `{COLUMN_NAME}`
-      LIMIT 20
+      LIMIT {LIMIT_2}
       );
     error_type: Profile Anomaly
   - id: '1262'
@@ -48,11 +48,11 @@ profile_anomaly_types:
     lookup_query: |-
       (SELECT 'Upper Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
       WHERE UPPER(`{COLUMN_NAME}`) = `{COLUMN_NAME}`
-      GROUP BY `{COLUMN_NAME}` LIMIT 20)
+      GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT_2})
       UNION ALL
       (SELECT 'Mixed Case' as casing, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`
       WHERE `{COLUMN_NAME}` <> UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` <> LOWER(`{COLUMN_NAME}`)
-      GROUP BY `{COLUMN_NAME}` LIMIT 20)
+      GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT_2})
     error_type: Profile Anomaly
   - id: '1260'
     test_id: '1028'
@@ -60,11 +60,11 @@ profile_anomaly_types:
     sql_flavor: mssql
     lookup_type: null
     lookup_query: |-
-      SELECT TOP 20 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
+      SELECT TOP {LIMIT_2} 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"
       WHERE 
UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" UNION ALL - SELECT TOP 20 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + SELECT TOP {LIMIT_2} 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") GROUP BY "{COLUMN_NAME}" error_type: Profile Anomaly @@ -76,11 +76,11 @@ profile_anomaly_types: lookup_query: |- (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) UNION ALL (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) error_type: Profile Anomaly - id: '1258' test_id: '1028' @@ -90,11 +90,11 @@ profile_anomaly_types: lookup_query: |- (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) UNION ALL (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) error_type: Profile Anomaly - id: '1473' test_id: '1028' @@ -104,11 +104,11 @@ profile_anomaly_types: lookup_query: |- (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) UNION ALL (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) error_type: Profile Anomaly - id: '1261' test_id: '1028' @@ -118,9 +118,9 @@ profile_anomaly_types: lookup_query: |- (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) UNION ALL (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") - GROUP BY "{COLUMN_NAME}" LIMIT 20) + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml index 87576c2d..876661df 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml @@ -31,7 +31,7 @@ profile_anomaly_types: WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, 
`{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1296' test_id: '1024' @@ -39,7 +39,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT 500; + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1138' test_id: '1024' @@ -47,7 +47,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1081' test_id: '1024' @@ -55,7 +55,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1056' test_id: '1024' @@ -63,7 +63,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1456' test_id: '1024' @@ -71,7 +71,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1195' test_id: '1024' @@ -79,5 +79,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE 
TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml index 03c47fc1..400424a9 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml @@ -27,7 +27,7 @@ profile_anomaly_types: WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1275' test_id: '1003' @@ -35,7 +35,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500; + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1117' test_id: '1003' @@ -43,7 +43,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1060' test_id: '1003' @@ -51,7 +51,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1035' test_id: '1003' @@ -59,7 +59,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1435' test_id: '1003' @@ -67,7 +67,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT 
"{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1174' test_id: '1003' @@ -75,5 +75,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml index a6dc9c91..4231f420 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml @@ -35,7 +35,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1123' test_id: '1009' @@ -43,7 +43,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1066' test_id: '1009' @@ -51,7 +51,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1041' test_id: '1009' @@ -59,7 +59,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1441' test_id: '1009' @@ -67,7 +67,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1180' test_id: '1009' @@ -75,5 +75,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml index f6bc2d42..9f3e805e 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml @@ -40,7 +40,8 @@ profile_anomaly_types: WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' - ORDER BY data_type, table_name; + ORDER BY data_type, table_name + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1277' test_id: '1005' @@ -48,7 +49,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1119' test_id: '1005' @@ -56,7 +57,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON 
columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT TOP {LIMIT} column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; error_type: Profile Anomaly - id: '1062' test_id: '1005' @@ -64,7 +65,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1037' test_id: '1005' @@ -72,7 +73,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, 
table_name; + SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1437' test_id: '1005' @@ -80,7 +81,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename; + SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1176' test_id: '1005' @@ -88,5 +89,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml index 554a78b7..1ddee506 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml @@ -40,7 +40,8 @@ profile_anomaly_types: WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' - ORDER BY data_type, table_name; + ORDER BY data_type, table_name + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1276' test_id: '1004' @@ -48,7 +49,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS STRING) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS STRING) || ',' || CAST(numeric_scale AS STRING) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1118' test_id: '1004' @@ -56,7 +57,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY 
data_type, table_name; + SELECT TOP {LIMIT} column_name, columns.table_name, CASE WHEN data_type = 'datetime' THEN 'datetime' WHEN data_type = 'datetime2' THEN 'datetime' WHEN data_type = 'varchar' THEN 'varchar(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'char' THEN 'char(' + CAST(character_maximum_length AS VARCHAR) + ')' WHEN data_type = 'numeric' THEN 'numeric(' + CAST(numeric_precision AS VARCHAR) + ',' + CAST(numeric_scale AS VARCHAR) + ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; error_type: Profile Anomaly - id: '1061' test_id: '1004' @@ -64,7 +65,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1036' test_id: '1004' @@ -72,7 +73,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name; + SELECT DISTINCT column_name, table_name, CASE WHEN data_type = 'timestamp without time zone' THEN 'timestamp' WHEN data_type = 'character varying' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) 
|| ')' WHEN data_type = 'character' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'numeric' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1436' test_id: '1004' @@ -80,7 +81,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename; + SELECT DISTINCT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type, tablename LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1175' test_id: '1004' @@ -88,5 +89,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name; + SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml index 29978d5c..87d80e61 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml @@ -28,7 +28,8 @@ 
profile_anomaly_types: SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` - ORDER BY `{COLUMN_NAME}`; + ORDER BY `{COLUMN_NAME}` + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1278' test_id: '1006' @@ -36,7 +37,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1120' test_id: '1006' @@ -44,7 +45,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1063' test_id: '1006' @@ -52,7 +53,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1038' test_id: '1006' @@ -60,7 +61,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1438' test_id: '1006' @@ -68,7 +69,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1177' test_id: '1006' @@ -76,5 +77,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml index 81d2d0ca..3cfd99ef 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml @@ -29,7 +29,7 @@ profile_anomaly_types: AND CAST(`{COLUMN_NAME}` AS STRING) = LOWER(CAST(`{COLUMN_NAME}` AS STRING)) AND CAST(`{COLUMN_NAME}` AS STRING) > '' GROUP BY `{COLUMN_NAME}` - LIMIT 
500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1267' test_id: '1029' @@ -39,7 +39,7 @@ profile_anomaly_types: lookup_query: |- SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` = UPPER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` = LOWER(`{COLUMN_NAME}`) AND `{COLUMN_NAME}` > '' - GROUP BY `{COLUMN_NAME}` LIMIT 500 + GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT} error_type: Profile Anomaly - id: '1265' test_id: '1029' @@ -47,7 +47,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' GROUP BY "{COLUMN_NAME}" error_type: Profile Anomaly @@ -59,7 +59,7 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' - GROUP BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly - id: '1263' test_id: '1029' @@ -69,7 +69,7 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' - GROUP BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly - id: '1474' test_id: '1029' @@ -79,7 +79,7 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' - GROUP BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly - id: '1266' test_id: '1029' @@ -89,5 +89,5 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' - GROUP BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml index 0281a7f0..dbaa2631 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml @@ -30,7 +30,7 @@ profile_anomaly_types: AND SUBSTR(CAST(`{COLUMN_NAME}` AS STRING), LENGTH(CAST(`{COLUMN_NAME}` AS STRING)), 1) <> '\'' GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1272' test_id: '1030' @@ -40,7 +40,7 @@ profile_anomaly_types: lookup_query: |- SELECT `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` < 'A' AND LEFT(`{COLUMN_NAME}`, 1) NOT IN ('"', ' ') AND RIGHT(`{COLUMN_NAME}`, 1) <> '''' - GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500 + GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT} 
error_type: Profile Anomaly - id: '1270' test_id: '1030' @@ -48,7 +48,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" error_type: Profile Anomaly @@ -60,7 +60,7 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' - GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly - id: '1268' test_id: '1030' @@ -70,7 +70,7 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' - GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly - id: '1475' test_id: '1030' @@ -80,7 +80,7 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' - GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly - id: '1271' test_id: '1030' @@ -90,5 +90,5 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' - GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml index 6761e2bc..a6118bed 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml @@ -36,7 +36,7 @@ profile_anomaly_types: '\ufeff', '\x65279') as `{COLUMN_NAME}_content`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COLUMN_NAME}` - GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500 + GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT} error_type: Profile Anomaly - id: '1275' test_id: '1031' @@ -44,7 +44,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", + SELECT TOP {LIMIT} REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", NCHAR(160), '\x160'), NCHAR(8201), '\x8201'), NCHAR(8203), '\x8203'), @@ -79,7 +79,7 @@ 
profile_anomaly_types: CHR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly - id: '1273' test_id: '1031' @@ -100,7 +100,7 @@ profile_anomaly_types: CHR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly - id: '1476' test_id: '1031' @@ -121,7 +121,7 @@ profile_anomaly_types: CHR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly - id: '1276' test_id: '1031' @@ -142,5 +142,5 @@ profile_anomaly_types: CHR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" - GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500 + GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml index 6a115e85..839c9fc8 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml @@ -39,7 +39,8 @@ profile_anomaly_types: OR `{COLUMN_NAME}` IS NULL ) GROUP BY `{COLUMN_NAME}` - ORDER BY `{COLUMN_NAME}`; + ORDER BY `{COLUMN_NAME}` + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1274' test_id: '1002' @@ -47,7 +48,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', 
'[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1116' test_id: '1002' @@ -55,7 +56,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?') OR "{COLUMN_NAME}" LIKE ' ' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?') OR "{COLUMN_NAME}" LIKE ' ' THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1059' test_id: '1002' @@ -63,7 +64,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN 
LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1034' test_id: '1002' @@ -71,7 +72,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1434' test_id: '1002' @@ -79,7 +80,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', 
'(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1173' test_id: '1002' @@ -87,5 +88,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml index 20e6fc37..005957b5 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml @@ -29,7 +29,7 @@ profile_anomaly_types: GROUP BY `{COLUMN_NAME}` HAVING 
COUNT(*) > 1 ORDER BY COUNT(*) DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1288' test_id: '1016' @@ -37,7 +37,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1130' test_id: '1016' @@ -45,7 +45,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC; error_type: Profile Anomaly - id: '1073' test_id: '1016' @@ -53,7 +53,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1048' test_id: '1016' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1448' test_id: '1016' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1187' test_id: '1016' @@ -77,5 +77,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml index 652fc467..7efb6ed9 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml @@ -27,7 +27,7 @@ profile_anomaly_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY 
`{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1338' test_id: '1100' @@ -35,7 +35,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1271' test_id: '1100' @@ -43,7 +43,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Profile Anomaly - id: '1272' test_id: '1100' @@ -51,7 +51,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1269' test_id: '1100' @@ -59,7 +59,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1470' test_id: '1100' @@ -67,7 +67,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1270' test_id: '1100' @@ -75,5 +75,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml index c4a3499d..74a91f06 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml @@ -28,7 +28,7 @@ profile_anomaly_types: WHERE LEFT(CAST(`{COLUMN_NAME}` AS STRING), 1) = '"' OR LEFT(CAST(`{COLUMN_NAME}` AS STRING), 1) = "'" GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1282' test_id: '1010' @@ -36,7 +36,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null 
lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE '"%"' OR `{COLUMN_NAME}` ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE '"%"' OR `{COLUMN_NAME}` ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1124' test_id: '1010' @@ -44,7 +44,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%"' OR "{COLUMN_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%"' OR "{COLUMN_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1067' test_id: '1010' @@ -52,7 +52,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1042' test_id: '1010' @@ -60,7 +60,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1442' test_id: '1010' @@ -68,7 +68,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1181' test_id: '1010' @@ -76,5 +76,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY 
"{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml index afb7893b..bd121c7c 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml @@ -28,7 +28,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1128' test_id: '1014' @@ -36,7 +36,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Profile Anomaly - id: '1071' test_id: '1014' @@ -44,7 +44,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1046' test_id: '1014' @@ -52,7 +52,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1446' test_id: '1014' @@ -60,7 +60,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1185' test_id: '1014' @@ -68,5 +68,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml index 964d7eb8..381c26c1 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml @@ -31,7 +31,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE (CASE WHEN `{COLUMN_NAME}` IN ('.', '?', ' ') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP '-{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '0{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP '9{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'x{2,}' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP 'z{2,}' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN `{COLUMN_NAME}` = '' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1127' test_id: '1013' @@ -39,7 +39,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( 
LOWER("{COLUMN_NAME}") LIKE '%..%' OR LOWER("{COLUMN_NAME}") LIKE '%--%' OR (LEN(REPLACE("{COLUMN_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), 'z', ''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1070' test_id: '1013' @@ -47,7 +47,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1045' test_id: '1013' @@ -55,7 +55,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' 
THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1445' test_id: '1013' @@ -63,7 +63,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1184' test_id: '1013' @@ -71,5 +71,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR 
LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml index 9ef1f377..3b7f394e 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml @@ -31,7 +31,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) UNION ALL ( @@ -40,7 +40,7 @@ profile_anomaly_types: WHERE SAFE_CAST(CAST(`{COLUMN_NAME}` AS STRING) AS FLOAT64) IS NULL GROUP BY `{COLUMN_NAME}` ORDER BY count DESC - LIMIT 10 + LIMIT {LIMIT_2} ) ORDER BY data_type, count DESC; error_type: Profile Anomaly @@ -50,7 +50,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10; + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT {LIMIT_2}; error_type: Profile Anomaly - id: '1137' test_id: '1023' @@ -58,7 +58,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT TOP {LIMIT_2} 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT 
DISTINCT TOP {LIMIT_2} 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1080' test_id: '1023' @@ -66,7 +66,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1055' test_id: '1023' @@ -74,7 +74,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1455' test_id: '1023' @@ -82,7 +82,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT A.* FROM ( SELECT TOP 10 DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM ( SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT 
'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly - id: '1194' test_id: '1023' @@ -90,5 +90,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT A.* FROM (SELECT DISTINCT TOP 10 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC; + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml index 4eb691ab..4f7b457b 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml @@ -37,7 +37,7 @@ profile_anomaly_types: ON UPPER(REGEXP_REPLACE(CAST(a.`{COLUMN_NAME}` AS STRING), r"[ '\.\-\,]", '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, b.possible_standard_value ORDER BY b.possible_standard_value ASC, count DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1289' test_id: '1017' @@ -45,7 +45,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1131' 
test_id: '1017' @@ -53,7 +53,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT TOP 500 UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC; + WITH CTE AS ( SELECT DISTINCT TOP {LIMIT} UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC; error_type: Profile Anomaly - id: '1074' test_id: '1017' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1049' test_id: '1017' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE 
UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1449' test_id: '1017' @@ -77,7 +77,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1188' test_id: '1017' @@ -85,5 +85,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly 
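Editor's note (not part of the patch): the hunks above and below all follow one pattern — hard-coded row caps (LIMIT 10/20/500, TOP 10/500) in the per-flavor lookup queries are replaced by the {LIMIT} / {LIMIT_2} placeholders, with LIMIT used for PostgreSQL, Redshift, Snowflake, Databricks, and BigQuery, and TOP {LIMIT} used for MSSQL, so the cap can be filled in at render time. As a minimal sketch of how such a placeholder substitution could work, assuming a simple str.format-style fill-in — the render_lookup helper and the example identifiers (demo_schema, customers, state) are illustrative only, not TestGen's actual API:

# Hypothetical sketch of rendering a flavor-specific lookup query template.
# Placeholder names ({TARGET_SCHEMA}, {TABLE_NAME}, {COLUMN_NAME}, {LIMIT}) are
# taken from the diff above; the helper itself is an assumption for illustration.
LOOKUP_TEMPLATES = {
    # ANSI-style flavors cap rows with a trailing LIMIT clause...
    "postgresql": (
        'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count '
        'FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" '
        'GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT};'
    ),
    # ...while SQL Server takes TOP n immediately after SELECT instead.
    "mssql": (
        'SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count '
        'FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" '
        'GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'
    ),
}

def render_lookup(flavor: str, schema: str, table: str, column: str, limit: int = 500) -> str:
    """Fill the placeholders of one flavor's lookup template with concrete values."""
    return LOOKUP_TEMPLATES[flavor].format(
        TARGET_SCHEMA=schema, TABLE_NAME=table, COLUMN_NAME=column, LIMIT=limit
    )

print(render_lookup("postgresql", "demo_schema", "customers", "state"))
print(render_lookup("mssql", "demo_schema", "customers", "state", limit=100))

The two rendered queries return the same result set capped at the same row count; only the limiting syntax differs per dialect, which is why the MSSQL hunks swap TOP {LIMIT} in place of a LIMIT clause.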
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml index 9763b988..0016e44d 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml @@ -28,7 +28,7 @@ profile_anomaly_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC - LIMIT 20; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1273' test_id: '1001' @@ -36,7 +36,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1115' test_id: '1001' @@ -44,7 +44,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; error_type: Profile Anomaly - id: '1058' test_id: '1001' @@ -52,7 +52,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1033' test_id: '1001' @@ -60,7 +60,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1433' test_id: '1001' @@ -68,7 +68,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1172' test_id: '1001' @@ -76,5 +76,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml index f8ea4cee..e31fd5dc 100644 --- 
a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml @@ -37,7 +37,8 @@ profile_anomaly_types: WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' - ORDER BY table_name; + ORDER BY table_name + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1280' test_id: '1008' @@ -48,7 +49,7 @@ profile_anomaly_types: \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\ \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\ \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\ - \ TABLE' ORDER BY table_name; " + \ TABLE' ORDER BY table_name LIMIT {LIMIT};" error_type: Profile Anomaly - id: '1122' test_id: '1008' @@ -56,7 +57,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY table_name; + SELECT TOP {LIMIT} column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY table_name; error_type: Profile Anomaly - id: '1065' test_id: '1008' @@ -64,7 +65,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY columns.table_name; + SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY columns.table_name LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1040' test_id: '1008' @@ -72,7 +73,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type; + SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1440' test_id: '1008' @@ -80,7 +81,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT columnname AS column_name, tablename AS table_name, external_type AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type; + SELECT columnname AS column_name, tablename AS table_name, external_type 
AS data_type FROM svv_external_columns WHERE schemaname = '{TARGET_SCHEMA}' AND columnname = '{COLUMN_NAME}' ORDER BY external_type LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1179' test_id: '1008' @@ -91,5 +92,5 @@ profile_anomaly_types: \ JOIN information_schema.tables ON columns.table_name = tables.table_name AND\ \ columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}'\ \ AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE\ - \ TABLE' ORDER BY table_name; " + \ TABLE' ORDER BY table_name LIMIT {LIMIT};" error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml index a8574f95..1c5bbf16 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml @@ -27,7 +27,7 @@ profile_anomaly_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1294' test_id: '1022' @@ -35,7 +35,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1136' test_id: '1022' @@ -43,7 +43,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Profile Anomaly - id: '1079' test_id: '1022' @@ -51,7 +51,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1054' test_id: '1022' @@ -59,7 +59,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1454' test_id: '1022' @@ -67,7 +67,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: 
'1193' test_id: '1022' @@ -75,5 +75,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml index 04790269..68e6e2e1 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml @@ -29,7 +29,7 @@ profile_anomaly_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1293' test_id: '1021' @@ -37,7 +37,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1135' test_id: '1021' @@ -45,7 +45,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Profile Anomaly - id: '1078' test_id: '1021' @@ -53,7 +53,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1053' test_id: '1021' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1453' test_id: '1021' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1192' test_id: '1021' @@ -77,5 +77,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT 
DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml index 23dc70fb..ea033f96 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml @@ -31,7 +31,7 @@ profile_anomaly_types: OR (CAST(`{COLUMN_NAME}` AS DATE) > DATE_ADD(CAST(CAST('{PROFILE_RUN_DATE}' AS DATETIME) AS DATE), INTERVAL 30 YEAR)) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1290' test_id: '1018' @@ -39,7 +39,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a WHERE (`{COLUMN_NAME}` < '1900-01-01'::DATE) OR (`{COLUMN_NAME}` > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a WHERE (`{COLUMN_NAME}` < '1900-01-01'::DATE) OR (`{COLUMN_NAME}` > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1132' test_id: '1018' @@ -47,7 +47,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 "{COLUMN_NAME}", CAST( '{PROFILE_RUN_DATE}' AS DATE) AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < CAST('1900-01-01' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST('{PROFILE_RUN_DATE}' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", CAST( '{PROFILE_RUN_DATE}' AS DATE) AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < CAST('1900-01-01' AS DATE) ) OR ("{COLUMN_NAME}" > DATEADD(YEAR, 30, CAST('{PROFILE_RUN_DATE}' AS DATE ))) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Profile Anomaly - id: '1075' test_id: '1018' @@ -55,7 +55,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1050' test_id: '1018' @@ -63,7 +63,7 @@ profile_anomaly_types: 
sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1450' test_id: '1018' @@ -71,7 +71,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1189' test_id: '1018' @@ -79,5 +79,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml index a935d7fb..7ba71123 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml @@ -30,7 +30,8 @@ profile_anomaly_types: WHERE LOWER(CAST(`{COLUMN_NAME}` AS STRING)) IN ( SELECT TRIM(val) FROM UNNEST(SPLIT(SUBSTR('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) AS val ) - GROUP BY `{COLUMN_NAME}`; + GROUP BY `{COLUMN_NAME}` + LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1230' test_id: '1027' @@ -38,7 +39,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '\\|')) AS value)) GROUP BY `{COLUMN_NAME}`; + SELECT `{COLUMN_NAME}`, COUNT(*) AS count 
FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '\\|')) AS value)) GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1231' test_id: '1027' @@ -46,7 +47,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") IN (SELECT trim(value) FROM STRING_SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', CHARINDEX(':', '{DETAIL_EXPRESSION}') + 2, 999), '|')) GROUP BY "{COLUMN_NAME}"; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") IN (SELECT trim(value) FROM STRING_SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', CHARINDEX(':', '{DETAIL_EXPRESSION}') + 2, 999), '|')) GROUP BY "{COLUMN_NAME}"; error_type: Profile Anomaly - id: '1232' test_id: '1027' @@ -54,7 +55,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) GROUP BY "{COLUMN_NAME}"; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") = ANY(STRING_TO_ARRAY(SUBSTRING('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|')) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1229' test_id: '1027' @@ -62,7 +63,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1458' test_id: '1027' @@ -70,7 +71,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}"; + WITH val_array AS (SELECT 1 as valkey, SPLIT_TO_ARRAY(SUBSTRING ('{DETAIL_EXPRESSION}', STRPOS('{DETAIL_EXPRESSION}', ':') + 2), '|') vals), val_list AS ( SELECT valkey, val::VARCHAR FROM val_array v, v.vals val ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" t INNER JOIN val_list v ON (LOWER("{COLUMN_NAME}") = v.val) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" 
LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1230' test_id: '1027' @@ -78,5 +79,5 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', POSITION(':', '{DETAIL_EXPRESSION}') + 2), '|'))) ) GROUP BY "{COLUMN_NAME}"; + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', POSITION(':', '{DETAIL_EXPRESSION}') + 2), '|'))) ) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml index 57b2901a..5b277a5e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml @@ -62,7 +62,8 @@ test_types: GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1333' test_id: '1500' @@ -86,7 +87,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1247' test_id: '1500' @@ -94,7 +96,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP {LIMIT} * FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total @@ -134,7 +136,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1245' test_id: '1500' @@ -158,7 +161,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1462' test_id: '1500' @@ -182,7 +186,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1246' test_id: '1500' @@ -206,7 +211,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2506' diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml 
b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml index fcc0487c..84b28ecf 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml @@ -64,7 +64,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1248' test_id: '1504' @@ -90,7 +91,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1247' test_id: '1504' @@ -98,7 +100,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP {LIMIT} * FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total @@ -142,7 +144,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1245' test_id: '1504' @@ -168,7 +171,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1466' test_id: '1504' @@ -194,7 +198,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1246' test_id: '1504' @@ -220,7 +225,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2509' diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml index 320ccc37..b4b03bc1 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml @@ -64,7 +64,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1245' test_id: '1505' @@ -90,7 +91,8 @@ test_types: WHERE (total IS NOT NULL AND 
match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1247' test_id: '1505' @@ -98,7 +100,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP {LIMIT} * FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total @@ -142,7 +144,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1245' test_id: '1505' @@ -168,7 +171,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1467' test_id: '1505' @@ -194,7 +198,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1246' test_id: '1505' @@ -220,7 +225,8 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2510' diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml index 58462bf0..e5355a76 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml @@ -62,7 +62,8 @@ test_types: GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1334' test_id: '1501' @@ -86,7 +87,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1251' test_id: '1501' @@ -94,7 +96,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP {LIMIT} * FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total @@ -134,7 +136,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1249' test_id: '1501' @@ -158,7 +161,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND 
match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1463' test_id: '1501' @@ -182,7 +186,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1250' test_id: '1501' @@ -206,7 +211,8 @@ test_types: {MATCH_HAVING_CONDITION} ) a GROUP BY {GROUPBY_NAMES} ) s WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2502' diff --git a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml index 97f00d83..3e9297e5 100644 --- a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml @@ -113,7 +113,7 @@ test_types: ) a WHERE LENGTH(CAST(`{COLUMN_NAME}` AS STRING)) = a.max_length AND a.max_length < {THRESHOLD_VALUE} - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1298' test_id: '1004' @@ -121,7 +121,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1140' test_id: '1004' @@ -129,7 +129,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} ; error_type: Test Results - id: '1083' test_id: '1004' @@ -137,7 +137,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length 
FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1001' test_id: '1004' @@ -145,7 +145,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1401' test_id: '1004' @@ -153,7 +153,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1197' test_id: '1004' @@ -161,6 +161,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml index 3630cc0c..2c02c157 100644 --- a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml @@ -57,7 +57,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1335' test_id: '1502' @@ -78,7 +79,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1255' test_id: '1502' @@ -86,7 +88,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT * + SELECT TOP 
{LIMIT} * FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} @@ -120,7 +122,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1253' test_id: '1502' @@ -141,7 +144,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1464' test_id: '1502' @@ -162,7 +166,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1254' test_id: '1502' @@ -183,7 +188,8 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test - ORDER BY {COLUMN_NAME_NO_QUOTES}; + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2501' diff --git a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml index 6f01a0b5..fcde8abd 100644 --- a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml @@ -110,7 +110,7 @@ test_types: SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {CUSTOM_QUERY} - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1300' test_id: '1006' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results - id: '1142' test_id: '1006' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY}; + SELECT TOP {LIMIT} * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY}; error_type: Test Results - id: '1085' test_id: '1006' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results - id: '1003' test_id: '1006' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results - id: '1403' test_id: '1006' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results - id: '1199' test_id: '1006' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Constant.yaml b/testgen/template/dbsetup_test_types/test_types_Constant.yaml 
index b9e5033f..67521638 100644 --- a/testgen/template/dbsetup_test_types/test_types_Constant.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Constant.yaml @@ -110,7 +110,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1301' test_id: '1007' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` <> {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1143' test_id: '1007' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1086' test_id: '1007' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1004' test_id: '1007' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1404' test_id: '1007' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1200' test_id: '1007' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml index d0d0ca27..7f341c3f 100644 --- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml +++ 
b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml @@ -141,7 +141,7 @@ test_types: LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1302' test_id: '1009' @@ -149,7 +149,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT 500; + WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT {LIMIT}; error_type: Test Results - id: '1144' test_id: '1009' @@ -188,7 +188,7 @@ test_types: FROM check_periods c LEFT JOIN data_by_period d ON (c.check_period = d.data_period) ) - SELECT check_period, record_ct, + SELECT TOP {LIMIT} check_period, record_ct, CASE WHEN record_ct = 0 THEN 'MISSING' ELSE 'Present' @@ -205,7 +205,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 day') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT 
period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT 500; + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 day') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1005' test_id: '1009' @@ -213,7 +213,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500; + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM 
daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1405' test_id: '1009' @@ -221,7 +221,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT 500; + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_date_count, MIN(c.period) AS next_available_date, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_date_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1201' test_id: '1009' @@ -229,6 +229,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, 
MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT 500; + WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml index b7554ca6..ffa38aa9 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml @@ -110,7 +110,7 @@ test_types: SELECT DISTINCT LENGTH(SPLIT(CAST(`{COLUMN_NAME}` AS STRING), '.')[SAFE_OFFSET(1)]) AS decimal_scale, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY decimal_scale - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1303' test_id: '1011' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY decimal_scale LIMIT 500; + SELECT DISTINCT LENGTH(SPLIT_PART(`{COLUMN_NAME}`::STRING, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY decimal_scale LIMIT {LIMIT}; error_type: Test Results - id: '1145' test_id: '1011' @@ -130,7 +130,7 @@ test_types: SELECT LEN(SUBSTRING(CAST(ABS("{COLUMN_NAME}") % 1 AS VARCHAR), 3, LEN("{COLUMN_NAME}"))) AS decimal_scale FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" ) - SELECT DISTINCT TOP 500 decimal_scale, COUNT(*) AS count + SELECT DISTINCT TOP {LIMIT} decimal_scale, COUNT(*) AS count FROM cte GROUP BY decimal_scale ORDER BY COUNT(*) DESC; error_type: Test Results - id: '1088' @@ -139,7 +139,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT 500; + SELECT DISTINCT 
LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT {LIMIT}; error_type: Test Results - id: '1006' test_id: '1011' @@ -147,7 +147,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500; + SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT {LIMIT}; error_type: Test Results - id: '1406' test_id: '1011' @@ -155,7 +155,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT 500; + SELECT DISTINCT DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DECIMAL_SCALE("{COLUMN_NAME}" :: SUPER) LIMIT {LIMIT}; error_type: Test Results - id: '1202' test_id: '1011' @@ -163,6 +163,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT 500; + SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml index e27cdb93..1762b558 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml @@ -112,7 +112,7 @@ test_types: WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1304' test_id: '1012' @@ -120,7 +120,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Test Results - id: '1146' test_id: '1012' @@ -128,7 +128,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results - id: '1089' test_id: '1012' @@ -136,7 +136,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" 
WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1007' test_id: '1012' @@ -144,7 +144,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1407' test_id: '1012' @@ -152,7 +152,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1203' test_id: '1012' @@ -160,6 +160,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml index d0382cb3..9e43a2b1 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml @@ -111,7 +111,7 @@ test_types: WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1305' test_id: '1013' @@ -119,7 +119,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NOT NULL GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT {LIMIT}; error_type: Test Results - id: '1147' test_id: '1013' @@ -127,7 +127,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results - id: '1090' test_id: '1013' @@ -135,7 +135,7 @@ 
test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1008' test_id: '1013' @@ -143,7 +143,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1408' test_id: '1013' @@ -151,7 +151,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results - id: '1204' test_id: '1013' @@ -159,6 +159,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml index cd7f6c04..8b5bcce2 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml @@ -52,7 +52,8 @@ test_types: GROUP BY {CONCAT_COLUMNS} ) SELECT * - FROM latest_ver; + FROM latest_ver + LIMIT {LIMIT}; error_type: Test Results - id: '1336' test_id: '1503' @@ -79,6 +80,7 @@ test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: Test Results - id: '1259' test_id: '1503' @@ -98,13 +100,13 @@ test_types: FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} ) - SELECT COALESCE(l.category, o.category) AS category, + SELECT TOP {LIMIT} COALESCE(l.category, o.category) AS category, o.pct_of_total AS old_pct, l.pct_of_total AS new_pct FROM latest_ver l FULL JOIN older_ver o ON (l.category = o.category) - ORDER BY COALESCE(l.category, o.category) + ORDER BY COALESCE(l.category, o.category); error_type: Test Results - id: '1260' test_id: '1503' @@ -131,6 +133,7 @@ test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: Test Results - id: '1257' test_id: '1503' @@ -157,6 +160,7 @@ 
test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: Test Results - id: '1465' test_id: '1503' @@ -183,6 +187,7 @@ test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: Test Results - id: '1258' test_id: '1503' @@ -209,6 +214,7 @@ test_types: FULL JOIN older_ver o ON (l.category = o.category) ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2503' diff --git a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml index af83785c..a186f74d 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml @@ -48,7 +48,8 @@ test_types: WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 - ORDER BY {GROUPBY_NAMES}; + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1257' test_id: '1510' @@ -62,6 +63,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1255' test_id: '1510' @@ -69,7 +71,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + SELECT TOP {LIMIT} {GROUPBY_NAMES}, COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} @@ -88,6 +90,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1253' test_id: '1510' @@ -101,6 +104,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1472' test_id: '1510' @@ -114,6 +118,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results - id: '1254' test_id: '1510' @@ -127,6 +132,7 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2511' diff --git a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml index 928f9815..7cebba6e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml @@ -110,7 +110,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$') GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1306' test_id: '1014' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1148' test_id: '1014' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT 
DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" NOT LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" NOT LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1091' test_id: '1014' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1009' test_id: '1014' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1409' test_id: '1014' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1205' test_id: '1014' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml index 3f02dd24..5aab6fc8 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml @@ -109,7 +109,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE DATETIME_DIFF(`{COLUMN_NAME}`, CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), DAY) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1307' test_id: '1015' @@ -117,7 +117,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, 
SIGN(`{COLUMN_NAME}`::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1149' test_id: '1015' @@ -125,7 +125,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1092' test_id: '1015' @@ -133,7 +133,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1010' test_id: '1015' @@ -141,7 +141,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1410' test_id: '1015' @@ -149,7 +149,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1206' test_id: '1015' @@ -157,6 +157,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml index 0ce7f4a8..a11cebaf 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml @@ 
-110,7 +110,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE DATETIME_DIFF(`{COLUMN_NAME}`, DATE_ADD(CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), INTERVAL 365 DAY), DAY) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1308' test_id: '1016' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1150' test_id: '1016' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1093' test_id: '1016' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1011' test_id: '1016' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1411' test_id: '1016' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1207' test_id: '1016' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} 
GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml index 96b8e33b..36686814 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml +++ b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml @@ -110,7 +110,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` ) WHERE lov <> '{THRESHOLD_VALUE}' - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1310' test_id: '1018' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS aggregated_values FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS aggregated_values FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1152' test_id: '1018' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}'; + WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT TOP {LIMIT} STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}'; error_type: Test Results - id: '1095' test_id: '1018' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1013' test_id: '1018' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1413' test_id: '1018' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") 
FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1209' test_id: '1018' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT 500; + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml index 66567768..6f2aa126 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml @@ -18,7 +18,111 @@ test_types: column_name_help: null default_parm_columns: baseline_value,threshold_value default_parm_values: |- - '(' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, '|' , 2) > '' THEN ',''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, '|' , 2), '''' , '''''' ) ) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 4) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 4), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 6) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 6), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 8) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 8), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 10) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 10), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 12) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 12), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 14) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 14), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 16) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 16), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 18) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 18), '''' , '''''' )) || '''' ELSE '' END || CASE WHEN SPLIT_PART(top_freq_values, '|' , 20) > '' THEN ',''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, '|' , 20), '''' , '''''' )) || '''' ELSE '' END, 2, 999) || ')',0 + '(' || SUBSTRING( + CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 1) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 1), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 2) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 2), ' | ', 1), + '''', + '''''' + ), 
+ '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 3) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 3), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 4) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 4), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 5) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 5), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 6) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 6), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 7) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 7), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 8) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 8), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 9) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 9), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END || CASE + WHEN SPLIT_PART(top_freq_values, E'\n', 10) > '' THEN ',''' || TRIM( + REPLACE ( + SPLIT_PART(SPLIT_PART(top_freq_values, E'\n', 10), ' | ', 1), + '''', + '''''' + ), + '| ' + ) || '''' + ELSE '' + END, + 2, + 999 + ) || ')',0 default_parm_prompts: |- List of Expected Values,Threshold Error Count default_parm_help: null @@ -110,7 +214,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1311' test_id: '1019' @@ -118,7 +222,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1153' test_id: '1019' @@ -126,7 +230,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ; + SELECT DISTINCT TOP {LIMIT} NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" ; error_type: Test Results - id: '1096' test_id: '1019' @@ -134,7 +238,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT 
NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1014' test_id: '1019' @@ -142,7 +246,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1414' test_id: '1019' @@ -150,7 +254,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1210' test_id: '1019' @@ -158,6 +262,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml index 939dc27b..698d63a3 100644 --- a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml @@ -110,7 +110,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE CAST(`{COLUMN_NAME}` AS DATE) < CAST(CAST('{BASELINE_VALUE}' AS DATETIME) AS DATE) GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1312' test_id: '1020' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1154' test_id: '1020' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST('{BASELINE_VALUE}' AS DATE) GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) < CAST('{BASELINE_VALUE}' AS DATE) GROUP BY 
"{COLUMN_NAME}"; error_type: Test Results - id: '1097' test_id: '1020' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1015' test_id: '1020' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1415' test_id: '1020' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1211' test_id: '1020' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml index 8563d339..ea5b7d56 100644 --- a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml @@ -109,7 +109,7 @@ test_types: SELECT DISTINCT `{COLUMN_NAME}`, (ABS(CAST(`{COLUMN_NAME}` AS NUMERIC)) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE CAST(`{COLUMN_NAME}` AS NUMERIC) < {BASELINE_VALUE} - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1313' test_id: '1021' @@ -117,7 +117,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1155' test_id: '1021' @@ -125,7 +125,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE}; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE}; error_type: Test Results - id: '1098' test_id: '1021' @@ -133,7 +133,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1016' test_id: '1021' @@ -141,7 +141,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1416' test_id: '1021' @@ -149,7 +149,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1212' test_id: '1021' @@ -157,6 +157,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml index 67069e25..7598d6ed 100644 --- a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml @@ -110,7 +110,7 @@ test_types: SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL OR CAST(`{COLUMN_NAME}` AS STRING) = '' - LIMIT 10; + LIMIT {LIMIT}; error_type: Test Results - id: '1314' test_id: '1022' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '' LIMIT 10; + SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL OR `{COLUMN_NAME}` :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results - id: '1156' test_id: '1022' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: 
|- - SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = ''; + SELECT TOP {LIMIT} * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR CAST("{COLUMN_NAME}" AS VARCHAR(255)) = ''; error_type: Test Results - id: '1099' test_id: '1022' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT 10; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results - id: '1017' test_id: '1022' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results - id: '1417' test_id: '1022' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results - id: '1213' test_id: '1022' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 10 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' ; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml index af459f04..0f155edc 100644 --- a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml @@ -136,7 +136,8 @@ test_types: FROM p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) - ORDER BY p.missing_period; + ORDER BY p.missing_period + LIMIT {LIMIT}; error_type: Test Results - id: '1315' test_id: '1023' @@ -144,7 +145,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - WITH daterange AS( SELECT explode( sequence( date_trunc('month', (SELECT MIN(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('month', (SELECT MAX(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('month', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('month', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN 
existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period; + WITH daterange AS( SELECT explode( sequence( date_trunc('month', (SELECT MIN(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('month', (SELECT MAX(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 month) ) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('month', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('month', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_month = e.period LEFT JOIN existing_periods f ON p.next_available_month = f.period ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results - id: '1157' test_id: '1023' @@ -183,7 +184,7 @@ test_types: FROM check_periods c LEFT JOIN data_by_period d ON (c.check_period = d.data_period) ) - SELECT check_period, record_ct, + SELECT TOP {LIMIT} check_period, record_ct, CASE WHEN record_ct = 0 THEN 'MISSING' ELSE 'Present' @@ -200,7 +201,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 month') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates :: DATE + INTERVAL '1 month') :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, 
COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1018' test_id: '1023' @@ -208,7 +209,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1418' test_id: '1023' @@ -216,7 +217,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_month_count, MIN(c.period) AS next_available_month, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_month_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1214' test_id: '1023' @@ -224,6 +225,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN 
existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml index 03f123e6..84d0052b 100644 --- a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml @@ -109,7 +109,8 @@ test_types: SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NOT REGEXP_CONTAINS(NULLIF(CAST(`{COLUMN_NAME}` AS STRING), ''), r'{BASELINE_VALUE}') - GROUP BY `{COLUMN_NAME}`; + GROUP BY `{COLUMN_NAME}` + LIMIT {LIMIT}; error_type: Test Results - id: '1318' test_id: '1026' @@ -117,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''),'{BASELINE_VALUE}') != 1 GROUP BY `{COLUMN_NAME}`; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(NULLIF(`{COLUMN_NAME}`::STRING, ''),'{BASELINE_VALUE}') != 1 GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1160' test_id: '1026' @@ -125,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT LIKE '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT LIKE '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1103' test_id: '1026' @@ -133,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count 
FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1021' test_id: '1026' @@ -141,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1421' test_id: '1026' @@ -149,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT SIMILAR TO '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1217' test_id: '1026' @@ -157,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''),'{BASELINE_VALUE}') != 1 GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''),'{BASELINE_VALUE}') != 1 GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Recency.yaml b/testgen/template/dbsetup_test_types/test_types_Recency.yaml index 69aedb37..278eb9d4 100644 --- a/testgen/template/dbsetup_test_types/test_types_Recency.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Recency.yaml @@ -109,7 +109,8 @@ test_types: lookup_query: |- SELECT DISTINCT col AS latest_date_available, CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}) AS test_run_date FROM (SELECT DATE_TRUNC(MAX(`{COLUMN_NAME}`), DAY) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - WHERE DATETIME_DIFF(CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), col, DAY) > {THRESHOLD_VALUE}; + WHERE DATETIME_DIFF(CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}), col, DAY) > {THRESHOLD_VALUE} + LIMIT {LIMIT}; error_type: Test Results - id: '1319' test_id: '1028' @@ -117,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) WHERE ABS(<%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%>) > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX(`{COLUMN_NAME}`) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) WHERE ABS(<%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%>) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1161' test_id: '1028' @@ -125,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, CAST('{TEST_DATE}' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a 
WHERE DATEDIFF(day, col, CAST('{TEST_DATE}' AS DATE)) > {THRESHOLD_VALUE}; + SELECT DISTINCT TOP {LIMIT} col AS latest_date_available, CAST('{TEST_DATE}' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE DATEDIFF(day, col, CAST('{TEST_DATE}' AS DATE)) > {THRESHOLD_VALUE}; error_type: Test Results - id: '1104' test_id: '1028' @@ -133,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE <%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%> > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE <%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%> > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1022' test_id: '1028' @@ -141,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1422' test_id: '1028' @@ -149,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results - id: '1218' test_id: '1028' @@ -157,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE}; + SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Required.yaml b/testgen/template/dbsetup_test_types/test_types_Required.yaml index 1149fbb5..ada30dfe 100644 --- a/testgen/template/dbsetup_test_types/test_types_Required.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Required.yaml @@ -108,7 +108,7 @@ test_types: SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1320' test_id: '1030' @@ -116,7 +116,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS 
NULL LIMIT 500; + SELECT * FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` IS NULL LIMIT {LIMIT}; error_type: Test Results - id: '1162' test_id: '1030' @@ -124,7 +124,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 500 * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL; + SELECT TOP {LIMIT} * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL; error_type: Test Results - id: '1105' test_id: '1030' @@ -132,7 +132,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT}; error_type: Test Results - id: '1023' test_id: '1030' @@ -140,7 +140,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT}; error_type: Test Results - id: '1423' test_id: '1030' @@ -148,7 +148,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT}; error_type: Test Results - id: '1219' test_id: '1030' @@ -156,6 +156,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT 500; + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml index 759e5a34..0fb0a904 100644 --- a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml @@ -112,7 +112,7 @@ test_types: WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^[0-9]{1,5}[A-Za-z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[A-Za-z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') GROUP BY `{COLUMN_NAME}` ORDER BY COUNT(*) DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1323' test_id: '1033' @@ -120,7 +120,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT}; error_type: Test Results - id: '1165' test_id: '1033' @@ -128,7 +128,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT TOP 
{LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Test Results - id: '1108' test_id: '1033' @@ -136,7 +136,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1026' test_id: '1033' @@ -144,7 +144,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1426' test_id: '1033' @@ -152,7 +152,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1222' test_id: '1033' @@ -160,6 +160,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml index dd72a774..746913cb 100644 --- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml @@ -55,7 +55,8 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND 
{WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) - GROUP BY {COLUMN_NAME_NO_QUOTES}; + GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1263' test_id: '1508' @@ -63,7 +64,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT {COLUMN_NAME_NO_QUOTES} + SELECT TOP {LIMIT} {COLUMN_NAME_NO_QUOTES} FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) @@ -94,6 +95,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1261' test_id: '1508' @@ -113,6 +115,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1468' test_id: '1508' @@ -132,6 +135,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results - id: '1262' test_id: '1508' @@ -151,6 +155,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT}; error_type: Test Results test_templates: - id: '2507' diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml index af62dff3..8f6d9362 100644 --- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml @@ -53,6 +53,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL 2 * {WINDOW_DAYS} DAY) AND {WINDOW_DATE_COLUMN} < DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) + LIMIT {LIMIT_2} ) UNION ALL ( @@ -66,7 +67,8 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_SUB((SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), INTERVAL {WINDOW_DAYS} DAY) - ); + LIMIT {LIMIT_2} + ) error_type: Test Results - id: '1337' test_id: '1509' @@ -85,6 +87,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -98,6 +101,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results - id: '1267' @@ -107,7 +111,7 @@ test_types: lookup_type: null lookup_query: |2- ( - SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + SELECT TOP {LIMIT_2} 'Prior 
Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) @@ -120,7 +124,7 @@ test_types: ) UNION ALL ( - SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + SELECT TOP {LIMIT_2} 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) @@ -149,6 +153,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -162,6 +167,7 @@ test_types: FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results - id: '1265' @@ -181,6 +187,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -194,6 +201,7 @@ test_types: FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results - id: '1469' @@ -213,6 +221,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -226,6 +235,7 @@ test_types: FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results - id: '1266' @@ -245,6 +255,7 @@ test_types: WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) UNION ALL ( @@ -258,6 +269,7 @@ test_types: FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + LIMIT {LIMIT_2} ) error_type: Test Results test_templates: diff --git a/testgen/template/dbsetup_test_types/test_types_US_State.yaml b/testgen/template/dbsetup_test_types/test_types_US_State.yaml index f2d22996..c9d51c5d 100644 --- a/testgen/template/dbsetup_test_types/test_types_US_State.yaml +++ b/testgen/template/dbsetup_test_types/test_types_US_State.yaml @@ -111,7 +111,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN 
('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1324' test_id: '1036' @@ -119,7 +119,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NULLIF(`{COLUMN_NAME}`, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1166' test_id: '1036' @@ -127,7 +127,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1109' test_id: '1036' @@ -135,7 +135,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN 
('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1027' test_id: '1036' @@ -143,7 +143,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1427' test_id: '1036' @@ -151,7 +151,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1223' test_id: '1036' @@ -159,6 +159,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN 
('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique.yaml b/testgen/template/dbsetup_test_types/test_types_Unique.yaml index c9cc6ca9..61eabf82 100644 --- a/testgen/template/dbsetup_test_types/test_types_Unique.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Unique.yaml @@ -112,7 +112,7 @@ test_types: GROUP BY `{COLUMN_NAME}` HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1325' test_id: '1034' @@ -120,7 +120,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT {LIMIT}; error_type: Test Results - id: '1167' test_id: '1034' @@ -128,7 +128,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC; error_type: Test Results - id: '1110' test_id: '1034' @@ -136,7 +136,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1028' test_id: '1034' @@ -144,7 +144,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1428' test_id: '1034' @@ -152,7 +152,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1224' test_id: '1034' @@ -160,6 +160,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT 
500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml index 7665c977..374a4d50 100644 --- a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml @@ -111,7 +111,7 @@ test_types: FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY COUNT(*) DESC - LIMIT 500; + LIMIT {LIMIT}; error_type: Test Results - id: '1326' test_id: '1035' @@ -119,7 +119,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT {LIMIT}; error_type: Test Results - id: '1168' test_id: '1035' @@ -127,7 +127,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Test Results - id: '1111' test_id: '1035' @@ -135,7 +135,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1029' test_id: '1035' @@ -143,7 +143,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1429' test_id: '1035' @@ -151,7 +151,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results - id: '1225' test_id: '1035' @@ -159,6 +159,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml 
b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml index fdef7072..4d5f876d 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml @@ -115,7 +115,7 @@ test_types: OR CAST(`{COLUMN_NAME}` AS STRING) LIKE '"%' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC - LIMIT 20; + LIMIT {LIMIT}; error_type: Test Results - id: '1330' test_id: '1043' @@ -123,7 +123,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`, '.*[[:cntrl:]].*') OR `{COLUMN_NAME}`::STRING LIKE ' %' OR `{COLUMN_NAME}`::STRING LIKE '''%''' OR `{COLUMN_NAME}`::STRING LIKE '"%"' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`, '.*[[:cntrl:]].*') OR `{COLUMN_NAME}`::STRING LIKE ' %' OR `{COLUMN_NAME}`::STRING LIKE '''%''' OR `{COLUMN_NAME}`::STRING LIKE '"%"' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1235' test_id: '1043' @@ -131,7 +131,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) || NCHAR(8203) || NCHAR(65279) || NCHAR(8239) || NCHAR(8201) || NCHAR(12288) || NCHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; error_type: Test Results - id: '1234' test_id: '1043' @@ -139,7 +139,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT 20; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1233' test_id: '1043' @@ -147,7 +147,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE 
TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1459' test_id: '1043' @@ -155,7 +155,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8203) || CHR(65279) || CHR(8239) || CHR(8201) || CHR(12288) || CHR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1236' test_id: '1043' @@ -163,6 +163,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml index 6e8929c5..6c08cc73 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml @@ -110,7 +110,7 @@ test_types: WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC - LIMIT 20; + LIMIT {LIMIT}; error_type: Test Results - id: '1331' test_id: '1044' @@ -118,7 +118,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1239' test_id: '1044' @@ -126,7 +126,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE 
TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; error_type: Test Results - id: '1238' test_id: '1044' @@ -134,7 +134,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1237' test_id: '1044' @@ -142,7 +142,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1460' test_id: '1044' @@ -150,7 +150,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1240' test_id: '1044' @@ -158,6 +158,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml index acba07f0..ab616fd8 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml @@ -111,7 +111,7 @@ test_types: WHERE TRANSLATE(CAST(`{COLUMN_NAME}` AS STRING), '012345678', '999999999') != '999' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC - LIMIT 20; + LIMIT {LIMIT}; error_type: Test Results - id: '1332' 
test_id: '1045' @@ -119,7 +119,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20; + SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`,'012345678','999999999') <> '999' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1243' test_id: '1045' @@ -127,7 +127,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; error_type: Test Results - id: '1242' test_id: '1045' @@ -135,7 +135,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1241' test_id: '1045' @@ -143,7 +143,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1461' test_id: '1045' @@ -151,7 +151,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results - id: '1244' test_id: '1045' @@ -159,6 +159,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC; + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml 
b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml index c774e4df..8217f3ad 100644 --- a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml @@ -136,7 +136,8 @@ test_types: FROM p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) - ORDER BY p.missing_period; + ORDER BY p.missing_period + LIMIT {LIMIT}; error_type: Test Results - id: '1327' test_id: '1037' @@ -144,7 +145,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - WITH daterange AS( SELECT explode(sequence( date_trunc('week', (SELECT min(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('week', (SELECT max(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('week', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('week', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period; + WITH daterange AS( SELECT explode(sequence( date_trunc('week', (SELECT min(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), date_trunc('week', (SELECT max(`{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`)), interval 1 week)) AS all_dates ), existing_periods AS ( SELECT DISTINCT date_trunc('week', `{COLUMN_NAME}`) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY date_trunc('week', `{COLUMN_NAME}`) ) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM ( SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON p.prior_available_week = e.period LEFT JOIN existing_periods f ON p.next_available_week = f.period ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results - id: '1169' test_id: '1037' @@ -183,7 +184,7 @@ test_types: FROM check_periods c LEFT JOIN data_by_period d ON (c.check_period = d.data_period) ) - SELECT check_period, record_ct, + SELECT TOP {LIMIT} check_period, record_ct, CASE WHEN record_ct = 0 THEN 'MISSING' ELSE 'Present' @@ -200,7 +201,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT 
(d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week' , MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week' , MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week', "{COLUMN_NAME}") :: DATE) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1030' test_id: '1037' @@ -208,7 +209,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH 
RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1430' test_id: '1037' @@ -216,7 +217,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, (SELECT period_count FROM existing_periods WHERE period = MAX(b.period) ) AS prior_available_week_count, MIN(c.period) AS next_available_week, (SELECT period_count FROM existing_periods WHERE period = MIN(c.period) ) AS next_available_week_count FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN 
existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ORDER BY d.all_dates LIMIT {LIMIT}; error_type: Test Results - id: '1226' test_id: '1037' @@ -224,6 +225,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period; + WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbupgrade/0157_incremental_upgrade.sql b/testgen/template/dbupgrade/0157_incremental_upgrade.sql new file mode 100644 index 00000000..99f792b0 --- /dev/null +++ b/testgen/template/dbupgrade/0157_incremental_upgrade.sql @@ -0,0 +1,25 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +DROP VIEW IF EXISTS v_latest_profile_results CASCADE; +DROP VIEW IF EXISTS v_latest_profile_anomalies; +DROP VIEW IF EXISTS v_profiling_runs; +DROP VIEW IF EXISTS v_test_runs; + +ALTER TABLE stg_data_chars_updates + DROP COLUMN project_code, + DROP COLUMN functional_table_type, + DROP COLUMN 
functional_data_type, + ADD COLUMN approx_record_ct BIGINT; + +ALTER TABLE data_table_chars + ADD COLUMN approx_record_ct BIGINT, + DROP COLUMN data_point_ct; + +ALTER TABLE profiling_runs + ADD COLUMN progress JSONB, + ADD COLUMN record_ct BIGINT, + ADD COLUMN data_point_ct BIGINT; + +ALTER TABLE profile_results + DROP COLUMN column_id, + ADD COLUMN query_error VARCHAR(2000); diff --git a/testgen/template/dbupgrade/0158_incremental_upgrade.sql b/testgen/template/dbupgrade/0158_incremental_upgrade.sql new file mode 100644 index 00000000..e33a6b2e --- /dev/null +++ b/testgen/template/dbupgrade/0158_incremental_upgrade.sql @@ -0,0 +1,44 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +DROP VIEW IF EXISTS v_latest_profile_results CASCADE; +DROP VIEW IF EXISTS v_queued_observability_results CASCADE; +DROP VIEW IF EXISTS v_test_results CASCADE; + +DROP SEQUENCE profile_results_dk_id_seq; +DROP SEQUENCE test_definitions_cat_test_id_seq; + +DROP TABLE working_agg_cat_tests; +DROP TABLE working_agg_cat_results; + +ALTER TABLE profile_results + DROP COLUMN dk_id; + +ALTER TABLE test_suites + DROP COLUMN test_action, + DROP COLUMN test_suite_schema; + +ALTER TABLE test_definitions + DROP CONSTRAINT test_definitions_cat_test_id_pk, + DROP COLUMN cat_test_id, + DROP COLUMN test_action, + ADD CONSTRAINT test_definitions_id_pk PRIMARY KEY (id); + +ALTER TABLE test_runs + DROP COLUMN duration, + ADD COLUMN progress JSONB; + +ALTER TABLE test_results + ALTER COLUMN result_message TYPE VARCHAR, + DROP COLUMN starttime, + DROP COLUMN endtime, + DROP COLUMN test_action, + DROP COLUMN subset_condition, + DROP COLUMN result_error_data, + DROP COLUMN result_query; + +UPDATE job_schedules + SET kwargs = jsonb_build_object('test_suite_id', test_suites.id) +FROM test_suites +WHERE job_schedules.key = 'run-tests' + AND job_schedules.kwargs->>'project_key' = test_suites.project_code + AND job_schedules.kwargs->>'test_suite_key' = test_suites.test_suite; diff --git a/testgen/template/dbupgrade/0159_incremental_upgrade.sql b/testgen/template/dbupgrade/0159_incremental_upgrade.sql new file mode 100644 index 00000000..bf28b8ef --- /dev/null +++ b/testgen/template/dbupgrade/0159_incremental_upgrade.sql @@ -0,0 +1,2 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; +ALTER TABLE connections ADD COLUMN connect_with_identity BOOLEAN DEFAULT FALSE; diff --git a/testgen/template/dbupgrade/0160_incremental_upgrade.sql b/testgen/template/dbupgrade/0160_incremental_upgrade.sql new file mode 100644 index 00000000..8026a5b3 --- /dev/null +++ b/testgen/template/dbupgrade/0160_incremental_upgrade.sql @@ -0,0 +1,5 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +UPDATE connections + SET max_query_chars = 20000 + WHERE max_query_chars = 9000; diff --git a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql b/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql deleted file mode 100644 index b6268c52..00000000 --- a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql +++ /dev/null @@ -1,148 +0,0 @@ --- Create one record per CAT query: all test sets against one table, split over max chars -INSERT INTO working_agg_cat_tests - (test_run_id, - schema_name, table_name, cat_sequence, test_count, test_time, - column_names, test_types, test_definition_ids, - test_actions, test_descriptions, - test_parms, test_measures, test_conditions) - --- Column types from latest profile_results -WITH column_types AS ( - SELECT pr.table_groups_id, - pr.connection_id, - pr.schema_name, - pr.table_name, - pr.column_name, - pr.column_type - FROM 
profile_results pr - INNER JOIN ( - SELECT table_groups_id, - connection_id, - schema_name, - table_name, - column_name, - MAX(run_date) AS max_run_date - FROM profile_results - GROUP BY table_groups_id, connection_id, schema_name, table_name, column_name - ) latest - ON pr.table_groups_id = latest.table_groups_id - AND pr.schema_name = latest.schema_name - AND pr.table_name = latest.table_name - AND pr.column_name = latest.column_name - AND pr.run_date = latest.max_run_date -), - --- Test details from each test type -test_detail AS ( - SELECT t.test_suite_id, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, - '{RUN_DATE}'::TIMESTAMP as test_time, - t.column_name, t.test_type, t.id::VARCHAR as test_definition_id, - t.test_action, t.test_description, - - SUBSTRING( - CASE WHEN t.baseline_ct > '' THEN ', Baseline_Ct=' || t.baseline_ct ELSE '' END - || CASE WHEN t.baseline_unique_ct > '' THEN ', Baseline_Unique_Ct=' || t.baseline_unique_ct ELSE '' END - || CASE WHEN t.baseline_value > '' THEN ', Baseline_Value=' || t.baseline_value ELSE '' END - || CASE WHEN t.baseline_value_ct > '' THEN ', Baseline_Value_Ct=' || t.baseline_value_ct ELSE '' END - || CASE WHEN t.baseline_sum > '' THEN ', Baseline_Sum=' || t.baseline_sum ELSE '' END - || CASE WHEN t.baseline_avg > '' THEN ', Baseline_Avg=' || t.baseline_avg ELSE '' END - || CASE WHEN t.baseline_sd > '' THEN ', Baseline_SD=' || t.baseline_sd ELSE '' END - || CASE WHEN t.threshold_value > '' THEN ', Threshold_Value=' || t.threshold_value ELSE '' END, - 3, 999) || ' ' - as parms, - - -- Standard Measure start - 'CAST(' || - -- Nested parm replacements - part of query, not Python parms - REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( - c.measure, - '{COLUMN_NAME}', '{QUOTE}' || COALESCE(t.column_name, '') || '{QUOTE}'), - '{COLUMN_TYPE}', COALESCE(ct.column_type, '')), - '{BASELINE_CT}', COALESCE(t.baseline_ct, '')), - '{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')), - '{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ), - '{BASELINE_VALUE_CT}', COALESCE(t.baseline_value_ct, '') ), - '{BASELINE_SUM}', COALESCE(t.baseline_sum, '') ), - '{BASELINE_AVG}', COALESCE(t.baseline_avg, '') ), - '{BASELINE_SD}', COALESCE(t.baseline_sd, '') ), - '{CUSTOM_QUERY}', COALESCE(t.custom_query, '')), - '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '')) - -- Standard measure end with pipe delimiter - || ' AS {VARCHAR_TYPE}) {CONCAT_OPERATOR} ''|'' ' as measure, - - -- Standard CASE for condition starts - 'CASE WHEN ' || - -- Nested parm replacements - standard - REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( - c.measure || c.test_operator || c.test_condition, - '{COLUMN_NAME}', '{QUOTE}' || COALESCE(t.column_name, '') || '{QUOTE}'), - '{COLUMN_TYPE}', COALESCE(ct.column_type, '')), - '{BASELINE_CT}', COALESCE(t.baseline_ct, '')), - '{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')), - '{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ), - '{BASELINE_VALUE_CT}', COALESCE(t.baseline_value_ct, '') ), - '{BASELINE_SUM}', COALESCE(t.baseline_sum, '') ), - '{BASELINE_AVG}', COALESCE(t.baseline_avg, '') ), - '{BASELINE_SD}', COALESCE(t.baseline_sd, '') ), - '{CUSTOM_QUERY}', COALESCE(t.custom_query, '')), - '{THRESHOLD_VALUE}', COALESCE(t.threshold_value, '')) - -- Standard case ends - || ' THEN ''0,'' ELSE ''1,'' END' as condition - FROM test_definitions t - INNER JOIN cat_test_conditions c - ON (t.test_type = c.test_type - AND '{SQL_FLAVOR}' = 
c.sql_flavor) - INNER JOIN test_suites s - ON t.test_suite_id = s.id - LEFT JOIN column_types ct - ON s.table_groups_id = ct.table_groups_id - AND t.schema_name = ct.schema_name - AND t.table_name = ct.table_name - AND t.column_name = ct.column_name - WHERE t.test_suite_id = '{TEST_SUITE_ID}' - AND t.schema_name = '{SCHEMA_NAME}' - AND t.table_name = '{TABLE_NAME}' - AND COALESCE(t.test_active, 'Y') = 'Y' - ), - -test_detail_split AS ( - SELECT test_suite_id, schema_name, table_name, test_time, - column_name, test_type, test_definition_id, test_action, test_description, - parms, measure, condition, - SUM(LENGTH(condition)) OVER (PARTITION BY t.schema_name, t.table_name - ORDER BY t.column_name ROWS UNBOUNDED PRECEDING ) as run_total_chars, - FLOOR( SUM(LENGTH(condition)) OVER (PARTITION BY t.schema_name, t.table_name - ORDER BY t.column_name ROWS UNBOUNDED PRECEDING ) - / {MAX_QUERY_CHARS} ) + 1 as query_split_no - FROM test_detail t -) - -SELECT '{TEST_RUN_ID}' as test_run_id, - d.schema_name, d.table_name, - d.query_split_no as cat_sequence, - COUNT(*) as test_count, - '{RUN_DATE}'::TIMESTAMP as test_time, - STRING_AGG(COALESCE(d.column_name, 'N/A'), '~|~' ORDER BY d.column_name) as column_names, - STRING_AGG(d.test_type, ',' ORDER BY d.column_name) as test_types, - STRING_AGG(d.test_definition_id, ',' ORDER BY d.column_name) as test_definition_ids, - -- Pipe delimiter below, because commas may be embedded - STRING_AGG(d.test_action, '|' ORDER BY d.column_name) as test_actions, - STRING_AGG(d.test_description, '|' ORDER BY d.column_name) as test_descriptions, - - -- Consolidated Parms - STRING_AGG( d.parms, '|' ORDER BY d.column_name) as parms, - - -- Consolidated Measures - -- Encode Null as text to decode when freeing kittens - STRING_AGG( 'COALESCE(' || d.measure || ',''' || '' || '|'')', - -- Use ++ as STRING_AGG delimiter -- replace with + later - '++' ORDER BY d.column_name) as measures, - - -- Consolidated CASE statements - STRING_AGG( d.condition, - -- Use ++ as STRING_AGG delimiter -- replace with + later - '++' ORDER BY d.column_name) as conditions - - FROM test_detail_split d -GROUP BY d.test_suite_id, d.schema_name, d.table_name, test_time, d.query_split_no; diff --git a/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql b/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql deleted file mode 100644 index b5c79617..00000000 --- a/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql +++ /dev/null @@ -1,11 +0,0 @@ -SELECT DISTINCT schema_name, - table_name - FROM test_definitions td - INNER JOIN test_types tt - ON td.test_type = tt.test_type - INNER JOIN table_groups tg - ON (td.table_groups_id = tg.id) - INNER JOIN connections c - ON (tg.connection_id = c.connection_id) - WHERE td.test_suite_id = :TEST_SUITE_ID - AND tt.run_type = 'CAT'; diff --git a/testgen/template/exec_cat_tests/ex_cat_results_parse.sql b/testgen/template/exec_cat_tests/ex_cat_results_parse.sql deleted file mode 100644 index 74f5dce5..00000000 --- a/testgen/template/exec_cat_tests/ex_cat_results_parse.sql +++ /dev/null @@ -1,68 +0,0 @@ --- Parses aggregated results and inserts into test_results table -WITH seq_digit AS ( - SELECT 0 as d UNION ALL - SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL - SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL - SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9 ), - seq_table_raw AS ( - SELECT CAST(a.d + (10 * b.d) + (100 * c.d) + (1000 * d.d) as INT) as nbr - FROM seq_digit a CROSS JOIN seq_digit b CROSS JOIN seq_digit c 
CROSS JOIN seq_digit d - ORDER BY nbr LIMIT 1000), - seq_table AS ( - SELECT nbr FROM seq_table_raw WHERE nbr > 0), - raw_results AS ( - SELECT t.test_run_id, t.schema_name, t.table_name, t.cat_sequence, t.test_count, - t.test_time, t.start_time, t.end_time, t.column_names, t.test_types, t.test_definition_ids, - t.test_actions, t.test_descriptions, - t.test_parms, t.test_measures, t.test_conditions, - r.measure_results, r.test_results - FROM working_agg_cat_tests t - INNER JOIN working_agg_cat_results r - ON (t.test_run_id = r.test_run_id - AND t.schema_name = r.schema_name - AND t.table_name = r.table_name - AND t.cat_sequence = r.cat_sequence) - WHERE t.test_run_id = :TEST_RUN_ID - AND t.column_names > '' - ), - parsed_results AS ( - SELECT t.schema_name, - t.table_name, - t.test_time, - t.start_time, - t.end_time, - nbr AS test_number, - SPLIT_PART(t.test_actions, '|,', s.nbr) AS test_action, - SPLIT_PART(t.test_descriptions, '|', s.nbr) AS test_description, - SPLIT_PART(t.column_names, '~|~', s.nbr) AS column_name, - SPLIT_PART(t.test_types, ',', s.nbr) AS test_type, - SPLIT_PART(t.test_definition_ids, ',', s.nbr) AS test_definition_id, - SPLIT_PART(t.test_parms, '|', s.nbr) AS test_parms, - SPLIT_PART(t.test_measures, '++', s.nbr) AS measure, - TRIM(SPLIT_PART(t.test_conditions, '++', s.nbr)) AS condition, - -- Restore encoded null value - NULLIF(SPLIT_PART(t.measure_results, '|', s.nbr), '') AS measure_result, - SPLIT_PART(t.test_results, ',', s.nbr) AS test_result - FROM raw_results t - CROSS JOIN seq_table s - ) -INSERT INTO test_results - (test_run_id, test_type, test_definition_id, test_suite_id, - test_time, starttime, endtime, schema_name, table_name, column_names, - skip_errors, input_parameters, result_code, - result_measure, test_action, subset_condition, result_query, test_description) -SELECT :TEST_RUN_ID as test_run_id, - r.test_type, r.test_definition_id::UUID, :TEST_SUITE_ID, r.test_time, r.start_time, r.end_time, - r.schema_name, r.table_name, r.column_name, - 0 as skip_errors, - r.test_parms as input_parameters, - r.test_result::INT as result_code, - r.measure_result as result_measure, - r.test_action, NULL as subset_condition, - 'SELECT ' || LEFT(REPLACE(r.condition, '{RUN_' || 'DATE}', :RUN_DATE), LENGTH(REPLACE(r.condition, '{RUN_' || 'DATE}', :RUN_DATE - )) - LENGTH(' THEN ''0,'' ELSE ''1,'' END')) || ' THEN 0 ELSE 1 END' - || ' FROM ' || r.schema_name || '.' 
|| r.table_name as result_query, - COALESCE(r.test_description, c.test_description) as test_description - FROM parsed_results r - INNER JOIN test_types c - ON r.test_type = c.test_type; diff --git a/testgen/template/exec_cat_tests/ex_cat_retrieve_agg_test_parms.sql b/testgen/template/exec_cat_tests/ex_cat_retrieve_agg_test_parms.sql deleted file mode 100644 index 7632fdb5..00000000 --- a/testgen/template/exec_cat_tests/ex_cat_retrieve_agg_test_parms.sql +++ /dev/null @@ -1,8 +0,0 @@ -SELECT schema_name, - table_name, - cat_sequence, - -- Replace list delimiters with concat operator - REPLACE(test_measures, '++', :CONCAT_OPERATOR) as test_measures, - REPLACE(test_conditions, '++', :CONCAT_OPERATOR) as test_conditions - FROM working_agg_cat_tests - WHERE test_run_id = :TEST_RUN_ID; diff --git a/testgen/template/exec_cat_tests/ex_cat_test_query.sql b/testgen/template/exec_cat_tests/ex_cat_test_query.sql deleted file mode 100644 index 3013780c..00000000 --- a/testgen/template/exec_cat_tests/ex_cat_test_query.sql +++ /dev/null @@ -1,7 +0,0 @@ -SELECT '{TEST_RUN_ID}' as test_run_id, - '{SCHEMA_NAME}' as schema_name, - '{TABLE_NAME}' as table_name, - '{CAT_SEQUENCE}' as cat_sequence, - {TEST_MEASURES} as measure_results, - {TEST_CONDITIONS} as test_results - FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} diff --git a/testgen/template/execution/disable_invalid_test_definitions.sql b/testgen/template/execution/disable_invalid_test_definitions.sql new file mode 100644 index 00000000..37ed5b11 --- /dev/null +++ b/testgen/template/execution/disable_invalid_test_definitions.sql @@ -0,0 +1,7 @@ +UPDATE test_definitions td +SET test_active = 'N', + test_definition_status = LEFT('Deactivated ' || :RUN_DATE || '.' || SUBSTRING(tr.result_message, 13), 200) +FROM test_results tr +WHERE td.id = tr.test_definition_id + AND tr.test_run_id = :TEST_RUN_ID + AND tr.result_status = 'Error'; diff --git a/testgen/template/execution/ex_get_tests_non_cat.sql b/testgen/template/execution/ex_get_tests_non_cat.sql deleted file mode 100644 index 69672e1b..00000000 --- a/testgen/template/execution/ex_get_tests_non_cat.sql +++ /dev/null @@ -1,51 +0,0 @@ -SELECT tt.test_type, - td.id::VARCHAR AS test_definition_id, - COALESCE(td.test_description, tt.test_description) AS test_description, - COALESCE(td.test_action, ts.test_action, '') AS test_action, - schema_name, - table_name, - column_name, - cast(coalesce(skip_errors, 0) as varchar(50)) as skip_errors, - coalesce(baseline_ct, '') as baseline_ct, - coalesce(baseline_unique_ct, '') as baseline_unique_ct, - coalesce(baseline_value, '') as baseline_value, - coalesce(baseline_value_ct, '') as baseline_value_ct, - coalesce(threshold_value, '') as threshold_value, - coalesce(baseline_sum, '') as baseline_sum, - coalesce(baseline_avg, '') as baseline_avg, - coalesce(baseline_sd, '') as baseline_sd, - coalesce(lower_tolerance, '') as lower_tolerance, - coalesce(upper_tolerance, '') as upper_tolerance, - case - when nullif(subset_condition, '') is null then '1=1' - else subset_condition end as subset_condition, - coalesce(groupby_names, '') as groupby_names, - case - when having_condition is null then '' - else concat('HAVING ', having_condition) end as having_condition, - coalesce(window_date_column, '') as window_date_column, - cast(coalesce(window_days, '0') as varchar(50)) as window_days, - coalesce(match_schema_name, '') as match_schema_name, - coalesce(match_table_name, '') as match_table_name, - coalesce(match_column_names, '') as match_column_names, - case - 
when nullif(match_subset_condition, '') is null then '1=1' - else match_subset_condition end as match_subset_condition, - coalesce(match_groupby_names, '') as match_groupby_names, - case - when match_having_condition is null then '' - else concat('HAVING ', match_having_condition) - END as match_having_condition, - coalesce(custom_query, '') as custom_query, - coalesce(tm.template_name, '') as template_name -FROM test_definitions td - INNER JOIN test_suites ts - ON (td.test_suite_id = ts.id) - INNER JOIN test_types tt - ON (td.test_type = tt.test_type) - LEFT JOIN test_templates tm - ON (td.test_type = tm.test_type - AND :SQL_FLAVOR = tm.sql_flavor) -WHERE td.test_suite_id = :TEST_SUITE_ID - AND tt.run_type = 'QUERY' - AND td.test_active = 'Y'; diff --git a/testgen/template/execution/ex_update_test_suite.sql b/testgen/template/execution/ex_update_test_suite.sql deleted file mode 100644 index 72505590..00000000 --- a/testgen/template/execution/ex_update_test_suite.sql +++ /dev/null @@ -1,13 +0,0 @@ -WITH last_run - AS (SELECT test_suite_id, MAX(test_starttime) as max_starttime - FROM test_runs - WHERE test_suite_id = :TEST_SUITE_ID - AND status = 'Complete' - GROUP BY test_suite_id) -UPDATE test_suites - SET last_complete_test_run_id = r.id - FROM test_runs r -INNER JOIN last_run l - ON (r.test_suite_id = l.test_suite_id - AND r.test_starttime = l.max_starttime) - WHERE test_suites.id = r.test_suite_id; \ No newline at end of file diff --git a/testgen/template/execution/get_active_test_definitions.sql b/testgen/template/execution/get_active_test_definitions.sql new file mode 100644 index 00000000..f59b670c --- /dev/null +++ b/testgen/template/execution/get_active_test_definitions.sql @@ -0,0 +1,46 @@ +SELECT td.id, + td.test_type, + schema_name, + table_name, + column_name, + skip_errors, + baseline_ct, + baseline_unique_ct, + baseline_value, + baseline_value_ct, + threshold_value, + baseline_sum, + baseline_avg, + baseline_sd, + lower_tolerance, + upper_tolerance, + subset_condition, + groupby_names, + having_condition, + window_date_column, + window_days, + match_schema_name, + match_table_name, + match_column_names, + match_subset_condition, + match_groupby_names, + match_having_condition, + custom_query, + tt.run_type, + tt.test_scope, + tm.template_name, + c.measure, + c.test_operator, + c.test_condition +FROM test_definitions td + LEFT JOIN test_types tt ON (td.test_type = tt.test_type) + LEFT JOIN test_templates tm ON ( + td.test_type = tm.test_type + AND :SQL_FLAVOR = tm.sql_flavor + ) + LEFT JOIN cat_test_conditions c ON ( + td.test_type = c.test_type + AND :SQL_FLAVOR = c.sql_flavor + ) +WHERE td.test_suite_id = :TEST_SUITE_ID + AND td.test_active = 'Y'; \ No newline at end of file diff --git a/testgen/template/execution/ex_update_history_threshold_last_n.sql b/testgen/template/execution/update_historic_thresholds.sql similarity index 95% rename from testgen/template/execution/ex_update_history_threshold_last_n.sql rename to testgen/template/execution/update_historic_thresholds.sql index b8b9d532..51d4340a 100644 --- a/testgen/template/execution/ex_update_history_threshold_last_n.sql +++ b/testgen/template/execution/update_historic_thresholds.sql @@ -16,7 +16,7 @@ WITH stats AS ( ORDER BY tr.test_time DESC LIMIT CASE WHEN d.history_calculation = 'Value' THEN 1 ELSE d.history_lookback END ) AS r ON TRUE - WHERE d.test_suite_id = '{TEST_SUITE_ID}' + WHERE d.test_suite_id = :TEST_SUITE_ID AND d.test_active = 'Y' AND d.history_lookback IS NOT NULL GROUP BY d.id, d.history_calculation, 
d.history_lookback diff --git a/testgen/template/execution/ex_finalize_test_run_results.sql b/testgen/template/execution/update_test_results.sql similarity index 100% rename from testgen/template/execution/ex_finalize_test_run_results.sql rename to testgen/template/execution/update_test_results.sql diff --git a/testgen/template/execution/ex_update_test_record_in_testrun_table.sql b/testgen/template/execution/update_test_run_stats.sql similarity index 55% rename from testgen/template/execution/ex_update_test_record_in_testrun_table.sql rename to testgen/template/execution/update_test_run_stats.sql index 53137157..15dab138 100644 --- a/testgen/template/execution/ex_update_test_record_in_testrun_table.sql +++ b/testgen/template/execution/update_test_run_stats.sql @@ -1,22 +1,18 @@ WITH stats AS ( SELECT r.id as test_run_id, - COALESCE(COUNT(tr.id) , 0) AS test_ct, - SUM(result_code) AS passed_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Failed' THEN 1 END), 0) AS failed_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Warning' THEN 1 END), 0) AS warning_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Log' THEN 1 END), 0) AS log_ct, - COALESCE(SUM(CASE WHEN tr.result_message ILIKE 'ERROR%' THEN 1 ELSE 0 END), 0) AS error_ct + COALESCE(COUNT(tr.id), 0) AS test_ct, + SUM(result_code) AS passed_ct, + COALESCE(SUM(CASE WHEN tr.result_status = 'Failed' THEN 1 END), 0) AS failed_ct, + COALESCE(SUM(CASE WHEN tr.result_status = 'Warning' THEN 1 END), 0) AS warning_ct, + COALESCE(SUM(CASE WHEN tr.result_status = 'Log' THEN 1 END), 0) AS log_ct, + COALESCE(SUM(CASE WHEN tr.result_status = 'Error' THEN 1 ELSE 0 END), 0) AS error_ct FROM test_runs r INNER JOIN test_results tr ON r.id = tr.test_run_id WHERE r.id = :TEST_RUN_ID GROUP BY r.id ) UPDATE test_runs - SET status = CASE WHEN length(:EXCEPTION_MESSAGE) = 0 then 'Complete' else 'Error' end, - test_endtime = :NOW_TIMESTAMP, - log_message = :EXCEPTION_MESSAGE, - duration = TO_CHAR(:NOW_TIMESTAMP - r.test_starttime, 'HH24:MI:SS'), - test_ct = s.test_ct, + SET test_ct = s.test_ct, passed_ct = s.passed_ct, failed_ct = s.failed_ct, warning_ct = s.warning_ct, diff --git a/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql b/testgen/template/flavors/bigquery/data_chars/get_schema_ddf.sql similarity index 85% rename from testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql rename to testgen/template/flavors/bigquery/data_chars/get_schema_ddf.sql index 1e3c93ae..ee2165d3 100644 --- a/testgen/template/flavors/bigquery/data_chars/schema_ddf_query_bigquery.sql +++ b/testgen/template/flavors/bigquery/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' AS project_code, - CURRENT_TIMESTAMP() AS refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -10,7 +9,6 @@ SELECT '{PROJECT_CODE}' AS project_code, ELSE LOWER(c.data_type) END AS column_type, c.data_type AS db_data_type, - NULL AS character_maximum_length, c.ordinal_position, CASE WHEN LOWER(c.data_type) = 'string' THEN 'A' @@ -21,7 +19,9 @@ SELECT '{PROJECT_CODE}' AS project_code, WHEN REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') THEN 'N' ELSE 'X' END AS general_type, - REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') AS is_decimal + REGEXP_CONTAINS(LOWER(c.data_type), r'(decimal|numeric|bignumeric)') AS is_decimal, + t.row_count AS approx_record_ct FROM `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` c + LEFT JOIN `{DATA_SCHEMA}.__TABLES__` t 
ON c.table_name = t.table_id WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position; diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql index 03ccee36..374de512 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_data_match_bigquery.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' AS test_type, '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -26,9 +24,7 @@ SELECT '{TEST_TYPE}' AS test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + COUNT(*) AS result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql index 0aee6ead..780538e4 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_relative_entropy_bigquery.sql @@ -32,8 +32,6 @@ SELECT '{TEST_TYPE}' AS test_type, '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -44,9 +42,7 @@ SELECT '{TEST_TYPE}' AS test_type, NULL as result_signal, CASE WHEN js_divergence > {THRESHOLD_VALUE} THEN 0 ELSE 1 END AS result_code, CONCAT('Divergence Level: ', CAST(js_divergence AS STRING), ', Threshold: {THRESHOLD_VALUE}.') AS result_message, - js_divergence AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + js_divergence AS result_measure FROM ( SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) AS js_divergence diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql index 70d97b32..87365dc5 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_table_changed_bigquery.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' AS test_type, '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -20,9 +18,7 @@ SELECT '{TEST_TYPE}' AS test_type, END AS result_message, CASE WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1 - END AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + END AS result_measure FROM ( SELECT {CUSTOM_QUERY} AS fingerprint FROM `{SCHEMA_NAME}.{TABLE_NAME}` diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql 
b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql index 5ba04cfd..4e47eaff 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_no_drops_bigquery.sql @@ -4,8 +4,6 @@ SELECT '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -25,9 +23,7 @@ SELECT ) ELSE 'No errors found.' END AS result_message, - COUNT(*) AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + COUNT(*) AS result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` diff --git a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql index c16c158e..9b051977 100644 --- a/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql +++ b/testgen/template/flavors/bigquery/exec_query_tests/ex_window_match_same_bigquery.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' AS test_type, '{TEST_SUITE_ID}' AS test_suite_id, '{TEST_RUN_ID}' AS test_run_id, '{RUN_DATE}' AS test_time, - '{START_TIME}' AS starttime, - CURRENT_TIMESTAMP AS endtime, '{SCHEMA_NAME}' AS schema_name, '{TABLE_NAME}' AS table_name, '{COLUMN_NAME_NO_QUOTES}' AS column_names, @@ -26,13 +24,11 @@ SELECT '{TEST_TYPE}' AS test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) AS result_measure, - '{SUBSET_DISPLAY}' AS subset_condition, - NULL AS result_query + COUNT(*) AS result_measure FROM ( -- Values in the prior timeframe but not in the latest ( - SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME} + SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_ADD( @@ -40,7 +36,7 @@ FROM ( INTERVAL -{WINDOW_DAYS} DAY ) EXCEPT DISTINCT - SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME} + SELECT 'Prior Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_ADD( @@ -55,7 +51,7 @@ FROM ( UNION ALL -- Values in the latest timeframe but not in the prior ( - SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME} + SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_ADD( @@ -67,7 +63,7 @@ FROM ( INTERVAL -{WINDOW_DAYS} DAY ) EXCEPT DISTINCT - SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME} + SELECT 'Latest Timeframe' AS missing_from, {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= DATE_ADD( diff --git a/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql index da6811be..23c60db8 100644 --- a/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql +++ b/testgen/template/flavors/bigquery/gen_query_tests/gen_table_changed_test.sql @@ -124,7 +124,14 @@ newtests AS ( WHEN general_type = 'A' THEN 'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(COUNT(DISTINCT @@@) AS STRING) || "|" || 
CAST(SUM(LENGTH(@@@)) AS STRING)' WHEN general_type = 'N' THEN - 'CAST(MIN(@@@) AS STRING) || "|" || CAST(MAX(@@@) AS STRING) || "|" || CAST(SUM(@@@) AS STRING) || "|" || CAST(ROUND(AVG(@@@), 5) AS STRING) || "|" || CAST(ROUND(STDDEV(CAST(@@@ AS FLOAT64)), 5) AS STRING)' + 'ARRAY_TO_STRING([ + CAST(COUNT(@@@) AS STRING), + CAST(COUNT(DISTINCT MOD(CAST(COALESCE(@@@,0) AS NUMERIC) * 1000000, CAST(1000003 AS NUMERIC))) AS STRING), + COALESCE(CAST(ROUND(MIN(CAST(@@@ AS NUMERIC)), 6) AS STRING), ''''), + COALESCE(CAST(ROUND(MAX(CAST(@@@ AS NUMERIC)), 6) AS STRING), ''''), + CAST(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) AS NUMERIC) * 1000000, CAST(1000000007 AS NUMERIC))), CAST(0 AS NUMERIC)), CAST(1000000007 AS NUMERIC)) AS STRING), + CAST(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) AS NUMERIC) * 1000000, CAST(1000000009 AS NUMERIC))), CAST(0 AS NUMERIC)), CAST(1000000009 AS NUMERIC)) AS STRING) + ], ''|'', '''')' END, '@@@', '`' || column_name || '`'), ' || "|" || ' diff --git a/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql b/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql deleted file mode 100644 index 4fdfcc6e..00000000 --- a/testgen/template/flavors/bigquery/profiling/project_get_table_sample_count_bigquery.sql +++ /dev/null @@ -1,30 +0,0 @@ -WITH stats AS ( - SELECT - COUNT(*) * 1.0 AS record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} AS FLOAT64) * COUNT(*) * 1.0 / 100.0) AS calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} AS FLOAT64) AS min_sample_ct, - CAST(999000 AS FLOAT64) AS max_sample_ct - FROM `{SAMPLING_TABLE}` -) -SELECT '{SAMPLING_TABLE}' AS schema_table, - CASE - WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END AS sample_count, - CASE - WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END AS sample_ratio, - ROUND( - CASE - WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, - 4) AS sample_percent_calc -FROM stats; diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml b/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml similarity index 82% rename from testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml rename to testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml index 5d0456af..0a9c6350 100644 --- a/testgen/template/flavors/bigquery/profiling/project_profiling_query_bigquery.yaml +++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml @@ -1,15 +1,15 @@ --- -strTemplate01_sampling: | +01_sampling: | WITH target_table AS ( SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` WHERE RAND() * 100 < {SAMPLE_PERCENT_CALC} ) SELECT -strTemplate01_else: | +01_else: | WITH target_table AS ( SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` ) SELECT -strTemplate01_5: | +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -22,29 +22,33 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS 
general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(IF(`{COL_NAME}` IS NULL, 1, 0)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(IF(`{COL_NAME}` IS NULL, 1, 0)) AS null_value_ct, -strTemplate03_ADN: MIN(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS min_length, + +03_ADN: MIN(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS min_length, MAX(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS max_length, AVG(NULLIF(LENGTH(CAST(`{COL_NAME}` AS STRING)), 0)) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM( + +04_A: SUM( CASE WHEN REGEXP_CONTAINS(TRIM(CAST(`{COL_NAME}` AS STRING)), r'^0(\.0*)?$') THEN 1 ELSE 0 END ) AS zero_value_ct, -strTemplate04_N: CAST(SUM(1 - ABS(SIGN(CAST(`{COL_NAME}` AS NUMERIC)))) AS INT64) AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: | +04_N: CAST(SUM(1 - ABS(SIGN(CAST(`{COL_NAME}` AS NUMERIC)))) AS INT64) AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: | COUNT( DISTINCT UPPER( REGEXP_REPLACE(CAST(`{COL_NAME}` AS STRING), r"[ '\.,-]", "") @@ -115,7 +119,7 @@ strTemplate05_A: | AND SUBSTR(`{COL_NAME}`, 1, 3) <> '666' THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'SSN' END AS std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -130,7 +134,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: | + +06_A: | ( SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) AS top_patterns FROM ( @@ -150,7 +155,7 @@ strTemplate06_A_patterns: | AND ( SELECT MAX(LENGTH(CAST({COL_NAME} AS STRING))) FROM `target_table` - ) BETWEEN 3 AND {PARM_MAX_PATTERN_LENGTH} + ) BETWEEN 3 AND {MAX_PATTERN_LENGTH} ) p GROUP BY pattern HAVING pattern > ' ' @@ -159,27 +164,9 @@ strTemplate06_A_patterns: | ) ) ps ) as top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: | - ( - SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) AS top_freq_values - FROM ( - SELECT CONCAT(CAST(ct AS STRING), ' | ', CAST({COL_NAME} AS STRING)) AS val, - ct - FROM ( - SELECT {COL_NAME}, - COUNT(*) AS ct - FROM `target_table` - WHERE {COL_NAME} > ' ' - GROUP BY {COL_NAME} - HAVING {COL_NAME} > ' ' - ORDER BY ct DESC, {COL_NAME} DESC - LIMIT 10 - ) - ) ps - ) as top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN(`{COL_NAME}`) AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN(`{COL_NAME}`) AS min_value, MIN(CASE WHEN `{COL_NAME}` > 0 THEN `{COL_NAME}` ELSE NULL END) AS min_value_over_0, MAX(`{COL_NAME}`) AS max_value, AVG(CAST(`{COL_NAME}` AS FLOAT64)) AS avg_value, @@ -187,7 +174,7 @@ strTemplate08_N: MIN(`{COL_NAME}`) AS min_value, MIN(pct_25) AS percentile_25, MIN(pct_50) AS percentile_50, MIN(pct_75) AS percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -195,9 +182,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(COALESCE(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5), 0)) as fractional_sum, -strTemplate10_else: NULL as 
fractional_sum, -strTemplate11_D: | + +10_N_dec: SUM(COALESCE(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5), 0)) as fractional_sum, +10_else: NULL as fractional_sum, + +11_D: | MIN(`{COL_NAME}`) AS min_date, -- Other flavors have a minimum threshold of 0001-01-01, but BigQuery doesn't make it easy to do the same MAX(`{COL_NAME}`) as max_date, COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 12 THEN 1 END) AS before_1yr_date_ct, @@ -211,8 +200,7 @@ strTemplate11_D: | COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY)) AS date_days_present, COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), WEEK)) AS date_weeks_present, COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH)) AS date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -225,10 +213,11 @@ strTemplate11_else: NULL as min_date, NULL as date_days_present, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST(`{COL_NAME}` AS INT64)) AS boolean_true_ct, -strTemplate12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: | + +12_B: SUM(CAST(`{COL_NAME}` AS INT64)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, + +14_A: | ( SELECT COUNT(DISTINCT REGEXP_REPLACE( @@ -243,31 +232,26 @@ strTemplate14_A_do_patterns: | ) as distinct_pattern_ct, SUM(CAST(SIGN(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', ''))) AS INT64)) AS embedded_space_ct, AVG(CAST(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', '')) AS FLOAT64)) AS avg_embedded_spaces, -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(CAST(SIGN(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', ''))) AS INT64)) AS embedded_space_ct, - AVG(CAST(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', '')) AS FLOAT64)) AS avg_embedded_spaces, -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id " -strTemplate98_sampling: ' FROM target_table' -strTemplate98_else: ' FROM target_table' -strTemplate99_N: | +16_all: " '{PROFILE_RUN_ID}' as profile_run_id " + +98_all: ' FROM target_table' + +99_N: | , (SELECT PERCENTILE_CONT(`{COL_NAME}`, 0.25) OVER() AS pct_25, PERCENTILE_CONT(`{COL_NAME}`, 0.50) OVER() AS pct_50, PERCENTILE_CONT(`{COL_NAME}`, 0.75) OVER() AS pct_75 FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(25)] AS pct_25, APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(50)] AS pct_50, APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(75)] AS pct_75 FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile -strTemplate99_else: ; -strTemplate100_sampling: ' ' +99_else: ; diff --git a/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql b/testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query.sql similarity index 100% rename from testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query_bigquery.sql
rename to testgen/template/flavors/bigquery/profiling/project_secondary_profiling_query.sql diff --git a/testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql b/testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql deleted file mode 100644 index 8a465da2..00000000 --- a/testgen/template/flavors/bigquery/validate_tests/ex_get_project_column_list.sql +++ /dev/null @@ -1,3 +0,0 @@ -select concat(concat(concat(table_schema, '.'), concat(table_name, '.')), column_name) as columns -from `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` -where table_schema in ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/bigquery/validate_tests/get_target_identifiers.sql b/testgen/template/flavors/bigquery/validate_tests/get_target_identifiers.sql new file mode 100644 index 00000000..a05b333a --- /dev/null +++ b/testgen/template/flavors/bigquery/validate_tests/get_target_identifiers.sql @@ -0,0 +1,5 @@ +SELECT table_schema AS schema_name, + table_name, + column_name +FROM `{DATA_SCHEMA}.INFORMATION_SCHEMA.COLUMNS` +WHERE table_schema IN ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql b/testgen/template/flavors/databricks/data_chars/get_schema_ddf.sql similarity index 86% rename from testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql rename to testgen/template/flavors/databricks/data_chars/get_schema_ddf.sql index 0cfb56f6..6ae63a94 100644 --- a/testgen/template/flavors/databricks/data_chars/schema_ddf_query_databricks.sql +++ b/testgen/template/flavors/databricks/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' AS project_code, - CURRENT_TIMESTAMP AS refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -11,7 +10,6 @@ SELECT '{PROJECT_CODE}' AS project_code, ELSE lower(c.full_data_type) END AS column_type, c.full_data_type AS db_data_type, - c.character_maximum_length, c.ordinal_position, CASE WHEN c.data_type IN ('STRING', 'CHAR') THEN 'A' @@ -23,7 +21,8 @@ SELECT '{PROJECT_CODE}' AS project_code, CASE WHEN c.numeric_scale > 0 THEN 1 ELSE 0 - END AS is_decimal + END AS is_decimal, + NULL AS approx_record_ct -- table statistics unavailable FROM information_schema.columns c WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position; diff --git a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql index 6e5184d6..fc354f45 100644 --- a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql +++ b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_no_drops_databricks.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM `{SCHEMA_NAME}`.`{TABLE_NAME}` diff --git a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql index 7a078dc7..a30768b1 100644 --- a/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql +++ b/testgen/template/flavors/databricks/exec_query_tests/ex_window_match_same_databricks.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( ( SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} diff --git a/testgen/template/flavors/databricks/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/databricks/gen_query_tests/gen_table_changed_test.sql index 1c6521bc..17e085da 100644 --- a/testgen/template/flavors/databricks/gen_query_tests/gen_table_changed_test.sql +++ b/testgen/template/flavors/databricks/gen_query_tests/gen_table_changed_test.sql @@ -121,9 +121,16 @@ newtests CASE WHEN general_type = 'D' THEN 'MIN(@@@)::STRING || ''|'' || MAX(@@@::STRING) || ''|'' || COUNT(DISTINCT @@@)::STRING' WHEN general_type = 'A' THEN 'MIN(@@@)::STRING || ''|'' || MAX(@@@::STRING) || ''|'' || COUNT(DISTINCT @@@)::STRING || ''|'' || SUM(LENGTH(@@@))::STRING' - WHEN general_type = 'N' THEN 'MIN(@@@)::STRING || ''|'' || MAX(@@@::STRING) || ''|'' || SUM(@@@)::STRING || ''|'' || ROUND(AVG(@@@), 5)::STRING || ''|'' || ROUND(STDDEV(@@@::FLOAT), 5)::STRING' + WHEN general_type = 'N' THEN 'CONCAT_WS(''|'', + COUNT(@@@)::STRING, + COUNT(DISTINCT MOD((COALESCE(@@@,0)::DECIMAL(38,6) * 1000000)::DECIMAL(38,0), 1000003))::STRING, + COALESCE((MIN(@@@)::DECIMAL(38,6))::STRING, ''''), + COALESCE((MAX(@@@)::DECIMAL(38,6))::STRING, ''''), + COALESCE(MOD(COALESCE(SUM(MOD((ABS(COALESCE(@@@,0))::DECIMAL(38,6) * 1000000)::DECIMAL, 1000000007)), 0), 1000000007)::STRING, ''''), + COALESCE(MOD(COALESCE(SUM(MOD((ABS(COALESCE(@@@,0))::DECIMAL(38,6) * 1000000)::DECIMAL, 1000000009)), 0), 1000000009)::STRING, '''') + )' END, - '@@@', '"' || column_name || '"'), + '@@@', '`' || column_name || '`'), ' || ''|'' || ' ORDER BY element_type, fingerprint_order, column_name) as fingerprint FROM combined diff --git a/testgen/template/flavors/databricks/profiling/project_get_table_sample_count_databricks.sql b/testgen/template/flavors/databricks/profiling/project_get_table_sample_count_databricks.sql deleted file mode 100644 index 9a62c3d6..00000000 --- a/testgen/template/flavors/databricks/profiling/project_get_table_sample_count_databricks.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as 
schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml b/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml similarity index 82% rename from testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml rename to testgen/template/flavors/databricks/profiling/project_profiling_query.yaml index 4c2cbaa4..2fc9350d 100644 --- a/testgen/template/flavors/databricks/profiling/project_profiling_query_databricks.yaml +++ b/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml @@ -1,7 +1,15 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: | + WITH target_table AS ( + SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` + ) + SELECT +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +22,30 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate03_ADN: MIN(LEN(`{COL_NAME}`)) AS min_length, + +03_ADN: MIN(LEN(`{COL_NAME}`)) AS min_length, MAX(LEN(`{COL_NAME}`)) AS max_length, AVG(CAST(NULLIF(LEN(`{COL_NAME}`), 0) AS FLOAT)) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM(CASE + +04_A: SUM(CASE WHEN LTRIM(RTRIM(`{COL_NAME}`)) RLIKE '0([.]0*)' THEN 1 ELSE 0 END) AS zero_value_ct, -strTemplate04_N: CAST(SUM( 1 - ABS(SIGN(`{COL_NAME}`)))AS BIGINT ) AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE(`{COL_NAME}`,' '''',.-',REPEAT(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, +04_N: CAST(SUM( 1 - ABS(SIGN(`{COL_NAME}`)))AS BIGINT ) AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE(`{COL_NAME}`,' '''',.-',REPEAT(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, SUM(CASE WHEN `{COL_NAME}` = '' THEN 1 ELSE 0 @@ -118,7 +130,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE(`{COL_NAME}`,' '''',.-', AND LEFT(`{COL_NAME}`, 3) NOT BETWEEN '734' AND '749' AND LEFT(`{COL_NAME}`, 3) <> '666' THEN 1 END) 
AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -133,7 +145,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: (SELECT CONCAT_WS(' | ', collect_list(ct_pattern)) + +06_A: (SELECT CONCAT_WS(' | ', collect_list(ct_pattern)) FROM ( SELECT TRANSLATE( @@ -141,32 +154,20 @@ strTemplate06_A_patterns: (SELECT CONCAT_WS(' | ', collect_list(ct_pattern)) ) AS pattern, COUNT(*) AS ct, ct || ' | ' || pattern AS ct_pattern - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` + FROM target_table WHERE trim(`{COL_NAME}`) != '' AND ( - (SELECT MAX(LEN(`{COL_NAME}`)) FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`) BETWEEN 3 and 25 + (SELECT MAX(LEN(`{COL_NAME}`)) FROM target_table) BETWEEN 3 and 25 ) GROUP BY pattern HAVING len(pattern) > 0 ORDER BY ct DESC LIMIT 5 )) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(CONCAT_WS(' | ', collect_list(val)), 1000) as concat_vals - FROM ( - SELECT CAST(COUNT(*) as VARCHAR(10)) || ' | ' || `{COL_NAME}` as val, - COUNT(*) as ct - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` - WHERE `{COL_NAME}` > ' ' - GROUP BY `{COL_NAME}` - HAVING `{COL_NAME}` > ' ' - ORDER BY COUNT(*) DESC, val ASC - LIMIT 10 - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN(`{COL_NAME}`) AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN(`{COL_NAME}`) AS min_value, MIN(CASE WHEN `{COL_NAME}` > 0 THEN `{COL_NAME}` ELSE NULL END) AS min_value_over_0, MAX(`{COL_NAME}`) AS max_value, AVG(CAST(`{COL_NAME}` AS FLOAT)) AS avg_value, @@ -174,7 +175,7 @@ strTemplate08_N: MIN(`{COL_NAME}`) AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -182,11 +183,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN(`{COL_NAME}`) IS NULL THEN NULL ELSE CASE WHEN MIN(`{COL_NAME}`) >= CAST('0001-01-01' as date) THEN MIN(`{COL_NAME}`) ELSE CAST('0001-01-01' as date) END END as min_date, @@ -225,8 +226,7 @@ strTemplate11_D: CASE COUNT(DISTINCT <%DATEDIFF_DAY; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) as date_days_present, COUNT(DISTINCT <%DATEDIFF_WEEK; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) as date_weeks_present, COUNT(DISTINCT <%DATEDIFF_MONTH; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -240,52 +240,36 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST(`{COL_NAME}` AS INTEGER)) AS boolean_true_ct, - -strTemplate12_else: NULL as boolean_true_ct, +12_B: SUM(CAST(`{COL_NAME}` AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, 
-strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE(`{COL_NAME}`, +14_A: ( SELECT COUNT(DISTINCT TRANSLATE(`{COL_NAME}`, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) ) AS pattern_ct - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` + FROM target_table WHERE `{COL_NAME}` > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ',''))) AS BIGINT)) AS embedded_space_ct, AVG(CAST(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ','')) AS FLOAT)) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(CAST(SIGN(LEN(RTRIM(LTRIM(`{COL_NAME}`))) - LEN(REPLACE(RTRIM(LTRIM(`{COL_NAME}`)),' ',''))) AS BIGINT)) AS embedded_space_ct, - AVG(CAST(LEN(RTRIM(LTRIM(`{COL_NAME}`))) - LEN(REPLACE(RTRIM(LTRIM(`{COL_NAME}`)),' ','')) AS FLOAT)) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, - -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" - -strTemplate98_sampling: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT)' +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_else: ' FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`' +98_all: ' FROM target_table' -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75 FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75 FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) LIMIT 1 ) pctile - -strTemplate99_else: ' ' - -strTemplate100_sampling: ' ' +99_else: ' ' diff --git a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query.sql similarity index 88% rename from testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql rename to testgen/template/flavors/databricks/profiling/project_secondary_profiling_query.sql index 7def8c78..c3bb4097 100644 --- a/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query_databricks.sql +++ b/testgen/template/flavors/databricks/profiling/project_secondary_profiling_query.sql @@ -1,13 +1,17 @@ --- Get Freqs for selected columns -WITH ranked_vals +WITH target_table AS - (SELECT `{COL_NAME}`, - COUNT(*) AS ct, - ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn + (SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` -- TG-IF do_sample_bool TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) -- TG-ENDIF + ), +-- Get Freqs for selected columns +ranked_vals +AS (SELECT `{COL_NAME}`, + COUNT(*) AS ct, + ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn + FROM target_table WHERE `{COL_NAME}` > ' ' GROUP BY `{COL_NAME}` ), @@ -32,7 +36,6 @@ SELECT '{PROJECT_CODE}' as project_code, (left, right) -> CASE WHEN CAST(SPLIT(left, '\\|')[0] AS 
INT) < CAST(SPLIT(right, '\\|')[0] AS INT) THEN -1 ELSE 1 END )), '^#^', '\n') AS top_freq_values, (SELECT MD5(CONCAT_WS('|', ARRAY_SORT(COLLECT_LIST(NULLIF(dist_col_name,''))))) as dvh - FROM (SELECT DISTINCT `{COL_NAME}` as dist_col_name - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}`) a + FROM (SELECT DISTINCT `{COL_NAME}` as dist_col_name FROM target_table) a ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql index 098da4d4..7e8d3fff 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql index fe60101f..accad515 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql index 89845709..e183241f 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_range_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql index 3fb69cc8..e5dbfbf8 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL FROM ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql index 19d0c515..0d17c0fc 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, CASE @@ -31,9 +29,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - NULL as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( {CUSTOM_QUERY} ) TEST; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql index 838ea5c0..52dd918d 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( ( SELECT {GROUPBY_NAMES} FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql index 0c0c0b19..f7758fa1 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql index 61137108..b194bde3 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_dupe_rows_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COALESCE(SUM(record_ct), 0) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COALESCE(SUM(record_ct), 0) as result_measure FROM ( SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql deleted file mode 100644 index fb717344..00000000 --- a/testgen/template/flavors/generic/exec_query_tests/ex_prior_match_generic.sql +++ /dev/null @@ -1,38 +0,0 @@ -SELECT '{TEST_TYPE}' as test_type, - '{TEST_DEFINITION_ID}' as test_definition_id, - '{TEST_SUITE_ID}' as test_suite_id, - '{RUN_DATE}' as test_time, '{START_TIME}' as starttime,CURRENT_TIMESTAMP as endtime, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME}' as column_names, - {SKIP_ERRORS} as skip_errors, - 'schema_name = {SCHEMA_NAME}, prior_schema = {MATCH_SCHEMA_NAME}, table_name = {TABLE_NAME}, column_name = {COLUMN_NAME}, subset_condition = {SUBSET_CONDITION}, mode = {MODE}' - as input_parameters, - NULL as result_signal, - CASE WHEN COUNT(*) > COALESCE(skip_errors, 0) THEN 0 ELSE 1 END as result_code, - CONCAT( - CONCAT( 'Mismatched measures: ', CAST( COALESCE(COUNT(*), 0) AS {VARCHAR_TYPE}) ), - CONCAT( ', Threshold: ', - CONCAT( CAST(COALESCE(skip_errors, 0) AS {VARCHAR_TYPE}), '.') - ) - ) AS result_message, - COUNT(*) as result_measure, - '{TEST_ACTION}' as test_action, - '{SUBSET_CONDITION}' as subset_condition, - NULL as result_query, - '{TEST_DESCRIPTION}' as test_description - FROM ( - ( SELECT {COLUMN_NAME} - FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} - WHERE {SUBSET_CONDITION} - EXCEPT - SELECT {COLUMN_NAME} - FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} - WHERE {SUBSET_CONDITION} ) - UNION -( SELECT {COLUMN_NAME} - FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} - WHERE {SUBSET_CONDITION} - EXCEPT - SELECT {COLUMN_NAME} - FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} - WHERE {SUBSET_CONDITION} ) -); diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql index 84be7315..6f30c530 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql @@ -30,8 +30,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -44,9 +42,7 @@ SELECT '{TEST_TYPE}' as test_type, CONCAT('Divergence Level: ', CONCAT(CAST(js_divergence AS {VARCHAR_TYPE}), ', Threshold: {THRESHOLD_VALUE}.')) as result_message, - js_divergence as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + js_divergence as result_measure FROM ( SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) as js_divergence diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql index bf573f78..672f19d6 
100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_table_changed_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -24,9 +22,7 @@ SELECT '{TEST_TYPE}' as test_type, WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1 - END as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + END as result_measure FROM ( SELECT {CUSTOM_QUERY} as fingerprint FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql index 81d0784e..7ece651a 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql index 3bb2e84b..9b463d7c 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( ( SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} diff --git a/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql b/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql deleted file mode 100644 index eacffa61..00000000 --- a/testgen/template/flavors/generic/validate_tests/ex_get_project_column_list.sql +++ /dev/null @@ -1,3 +0,0 @@ -select concat(concat(concat(table_schema, '.'), concat(table_name, '.')), column_name) as columns -from information_schema.columns -where table_schema in ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/generic/validate_tests/get_target_identifiers.sql b/testgen/template/flavors/generic/validate_tests/get_target_identifiers.sql new file mode 100644 index 00000000..dba356de --- /dev/null +++ b/testgen/template/flavors/generic/validate_tests/get_target_identifiers.sql @@ -0,0 +1,5 @@ +SELECT table_schema AS schema_name, + table_name, + column_name +FROM information_schema.columns +WHERE table_schema IN ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql b/testgen/template/flavors/mssql/data_chars/get_schema_ddf.sql similarity index 64% rename from testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql rename to testgen/template/flavors/mssql/data_chars/get_schema_ddf.sql index 8b113f7c..8a3ea74f 100644 --- a/testgen/template/flavors/mssql/data_chars/schema_ddf_query_mssql.sql +++ b/testgen/template/flavors/mssql/data_chars/get_schema_ddf.sql @@ -1,6 +1,15 @@ -SELECT '{PROJECT_CODE}' as project_code, - CURRENT_TIMESTAMP as refresh_timestamp, - c.table_schema, +WITH approx_cts AS ( + SELECT SCHEMA_NAME(o.schema_id) AS schema_name, + o.name AS table_name, + SUM(p.rows) AS approx_record_ct + FROM sys.objects o + LEFT JOIN sys.partitions p ON p.object_id = o.object_id + WHERE p.index_id IN (0, 1) -- 0 = heap, 1 = clustered index + OR p.index_id IS NULL + GROUP BY o.schema_id, o.name +) +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -14,14 +23,13 @@ SELECT '{PROJECT_CODE}' as project_code, ELSE c.data_type END AS column_type, CASE WHEN c.data_type LIKE '%char' OR c.data_type LIKE '%binary' - THEN c.data_type + '(' + CAST(c.character_maximum_length AS VARCHAR) + ')' + THEN c.data_type + COALESCE('(' + CAST(c.character_maximum_length AS VARCHAR) + ')', '') WHEN c.data_type IN ('datetime2', 'datetimeoffset', 'time') - THEN c.data_type + '(' + CAST(c.datetime_precision AS VARCHAR) + ')' + THEN c.data_type + COALESCE('(' + CAST(c.datetime_precision AS VARCHAR) + ')', '') WHEN c.data_type IN ('numeric', 'decimal') - THEN c.data_type + '(' + CAST(c.numeric_precision AS VARCHAR) + ',' - + CAST(c.numeric_scale AS VARCHAR) + ')' + THEN c.data_type + COALESCE('(' + CAST(c.numeric_precision AS VARCHAR) + ',' + + CAST(c.numeric_scale AS VARCHAR) + ')', '') ELSE c.data_type END AS db_data_type, - c.character_maximum_length, c.ordinal_position, CASE WHEN LOWER(c.data_type) LIKE '%char%' @@ -40,7 +48,9 @@ SELECT '{PROJECT_CODE}' as project_code, ELSE 'X' END AS general_type, - CASE WHEN c.numeric_scale > 0 THEN 1 ELSE 0 END AS is_decimal + CASE WHEN c.numeric_scale > 0 THEN 1 ELSE 0 END AS is_decimal, + a.approx_record_ct AS approx_record_ct FROM information_schema.columns c + LEFT JOIN approx_cts a ON c.table_schema = a.schema_name AND c.table_name = a.table_name 
WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position; diff --git a/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql b/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql index 7b26cbab..4ec91d25 100644 --- a/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql +++ b/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql @@ -30,8 +30,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -44,9 +42,7 @@ SELECT '{TEST_TYPE}' as test_type, CONCAT('Divergence Level: ', CONCAT(CAST(js_divergence AS VARCHAR), ', Threshold: {THRESHOLD_VALUE}.')) as result_message, - js_divergence as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + js_divergence as result_measure FROM ( SELECT 0.5 * ABS(SUM(new_pct * LOG(new_pct/avg_pct)/LOG(2))) + 0.5 * ABS(SUM(old_pct * LOG(old_pct/avg_pct)/LOG(2))) as js_divergence diff --git a/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql b/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql index 978a46dd..b448fe84 100644 --- a/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql +++ b/testgen/template/flavors/mssql/exec_query_tests/ex_table_changed_mssql.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -24,9 +22,7 @@ SELECT '{TEST_TYPE}' as test_type, WHEN fingerprint = '{BASELINE_VALUE}' THEN 0 ELSE 1 - END as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + END as result_measure FROM ( SELECT {CUSTOM_QUERY} as fingerprint FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WITH (NOLOCK) WHERE {SUBSET_CONDITION} diff --git a/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql index 3f8be00e..d352848e 100644 --- a/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql +++ b/testgen/template/flavors/mssql/gen_query_tests/gen_table_changed_test.sql @@ -14,8 +14,9 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date AND ts.id = '{TEST_SUITE_ID}' AND p.run_date::DATE <= '{AS_OF_DATE}' GROUP BY r.table_groups_id), -curprof AS (SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, - distinct_value_ct, record_ct, max_value, min_value, avg_value, stdev_value, null_value_ct +curprof AS (SELECT p.profile_run_id, p.schema_name, p.table_name, p.column_name, p.functional_data_type, + p.general_type, p.distinct_value_ct, p.record_ct, p.max_value, p.min_value, + p.avg_value, p.stdev_value, p.null_value_ct FROM last_run lr INNER JOIN profile_results p ON (lr.table_groups_id = p.table_groups_id @@ -28,7 +29,7 @@ locked AS (SELECT schema_name, table_name AND lock_refresh = 'Y'), -- IDs - TOP 2 id_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, 
general_type, column_type, + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, distinct_value_ct, ROW_NUMBER() OVER (PARTITION BY schema_name, table_name ORDER BY @@ -42,7 +43,7 @@ id_cols AND functional_data_type ILIKE 'ID%'), -- Process Date - TOP 1 process_date_cols - AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, + AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, distinct_value_ct, ROW_NUMBER() OVER (PARTITION BY schema_name, table_name ORDER BY @@ -57,7 +58,7 @@ process_date_cols AND functional_data_type ILIKE 'process%'), -- Transaction Date - TOP 1 tran_date_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, distinct_value_ct, ROW_NUMBER() OVER (PARTITION BY schema_name, table_name ORDER BY @@ -70,9 +71,9 @@ tran_date_cols -- Numeric Measures numeric_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, column_type, + AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, /* - -- Subscores + -- Subscores -- save for reference distinct_value_ct * 1.0 / NULLIF(record_ct, 0) AS cardinality_score, (max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS range_score, LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2)) AS nontriviality_score, @@ -98,19 +99,19 @@ numeric_cols_ranked FROM numeric_cols WHERE change_detection_score IS NOT NULL), combined - AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, column_type, 10 + rank AS fingerprint_order + AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order FROM id_cols WHERE rank <= 2 UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, column_type, 20 + rank AS fingerprint_order + SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order FROM process_date_cols WHERE rank = 1 UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, column_type, 30 + rank AS fingerprint_order + SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order FROM tran_date_cols WHERE rank = 1 UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, column_type, 40 + rank AS fingerprint_order + SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order FROM numeric_cols_ranked WHERE rank = 1 ), newtests AS ( @@ -121,10 +122,16 @@ newtests AS ( 'CAST(COUNT(*) AS varchar) + ''|'' + ' || STRING_AGG( REPLACE( CASE - WHEN general_type = 'D' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(COUNT(DISTINCT @@@) AS NVARCHAR)' - WHEN general_type = 'A' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(COUNT(DISTINCT @@@) AS NVARCHAR) + ''|'' + CAST(SUM(LEN(@@@)) AS NVARCHAR)' - WHEN general_type = 'N' AND column_type ILIKE 
'%int%' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(SUM(CAST(@@@ AS BIGINT)) AS NVARCHAR) + ''|'' + CAST(ROUND(AVG(CAST(@@@ AS DECIMAL(30,5))), 5) AS NVARCHAR) + ''|'' + CAST(ROUND(STDEV(CAST(@@@ AS FLOAT)), 5) AS NVARCHAR)' - WHEN general_type = 'N' AND column_type NOT ILIKE '%int%' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + MAX(CAST(@@@ AS NVARCHAR)) + ''|'' + CAST(SUM(@@@) AS NVARCHAR) + ''|'' + CAST(ROUND(AVG(@@@), 5) AS NVARCHAR) + ''|'' + CAST(ROUND(STDEV(CAST(@@@ AS FLOAT)), 5) AS NVARCHAR)' + WHEN general_type = 'D' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + CAST(MAX(@@@) AS NVARCHAR) + ''|'' + CAST(COUNT_BIG(DISTINCT @@@) AS NVARCHAR)' + WHEN general_type = 'A' THEN 'CAST(MIN(@@@) AS NVARCHAR) + ''|'' + CAST(MAX(@@@) AS NVARCHAR) + ''|'' + CAST(COUNT_BIG(DISTINCT @@@) AS NVARCHAR) + ''|'' + CAST(SUM(LEN(@@@)) AS NVARCHAR)' + WHEN general_type = 'N' THEN 'CONCAT_WS(''|'', + CAST(COUNT_BIG(@@@) AS VARCHAR(20)), + CAST(COUNT_BIG(DISTINCT CAST(CAST(CAST(COALESCE(@@@,0) AS DECIMAL(38,6)) * 1000000 AS DECIMAL(38,0)) % 1000003 AS INT)) AS VARCHAR(20)), + COALESCE(CAST(CAST(MIN(@@@) AS DECIMAL(38,6)) AS VARCHAR(50)), ''''), + COALESCE(CAST(CAST(MAX(@@@) AS DECIMAL(38,6)) AS VARCHAR(50)), ''''), + CAST((COALESCE(SUM(CAST(CAST(ABS(CAST(COALESCE(@@@,0) AS DECIMAL(38,6))) * 1000000 AS DECIMAL(38,0)) % 1000000007 AS DECIMAL(38,0))), 0) % 1000000007) AS VARCHAR(12)), + CAST((COALESCE(SUM(CAST(CAST(ABS(CAST(COALESCE(@@@,0) AS DECIMAL(38,6))) * 1000000 AS DECIMAL(38,0)) % 1000000009 AS DECIMAL(38,0))), 0) % 1000000009) AS VARCHAR(12)) + )' END, '@@@', '"' || column_name || '"' ), diff --git a/testgen/template/flavors/mssql/profiling/project_get_table_sample_count_mssql.sql b/testgen/template/flavors/mssql/profiling/project_get_table_sample_count_mssql.sql deleted file mode 100644 index b7ccafaf..00000000 --- a/testgen/template/flavors/mssql/profiling/project_get_table_sample_count_mssql.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT CAST(COUNT(*) as FLOAT) as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0, 0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml similarity index 81% rename from testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml rename to testgen/template/flavors/mssql/profiling/project_profiling_query.yaml index 75ed4598..77ec98c8 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml +++ 
b/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml @@ -1,7 +1,15 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) + ) + SELECT +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +22,30 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT(CASE WHEN "{COL_NAME}" IS NOT NULL THEN 1 END) AS value_ct, NULL AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, AVG(CAST(NULLIF(LEN("{COL_NAME}"), 0) AS FLOAT)) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM(CASE + +04_A: SUM(CASE WHEN LTRIM(RTRIM("{COL_NAME}")) LIKE '0([.]0*)' THEN 1 ELSE 0 END) AS zero_value_ct, -strTemplate04_N: CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")))AS BIGINT ) AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, +04_N: CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")))AS BIGINT ) AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, SUM(CASE WHEN "{COL_NAME}" = '' THEN 1 ELSE 0 @@ -120,7 +132,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-', AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -135,7 +147,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + +06_A: ( SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) + ' | ' + pattern AS pattern, COUNT(*) AS ct @@ -143,27 +156,16 @@ strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) + FROM target_table WHERE "{COL_NAME}" > ' ' AND ((SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH})) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH})) 
p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC ) ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC, val ASC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) + ' | ' + "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*) DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, @@ -171,7 +173,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -179,11 +181,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(("{COL_NAME}" % 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(("{COL_NAME}" % 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE CASE WHEN MIN("{COL_NAME}") >= CAST('0001-01-01' as date) THEN MIN("{COL_NAME}") ELSE CAST('0001-01-01' as date) END END as min_date, @@ -222,8 +224,7 @@ strTemplate11_D: CASE COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -237,52 +238,36 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, - -strTemplate12_else: NULL as boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT TRANSLATE("{COL_NAME}" COLLATE Latin1_General_BIN, +14_A: ( SELECT COUNT(DISTINCT TRANSLATE("{COL_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ',''))) AS BIGINT)) AS embedded_space_ct, AVG(CAST(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ','')) AS FLOAT)) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(CAST(SIGN(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ',''))) AS BIGINT)) AS embedded_space_ct, - AVG(CAST(LEN(RTRIM(LTRIM("{COL_NAME}"))) - 
LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ','')) AS FLOAT)) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, - -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)' +98_all: ' FROM target_table ' -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)' - -strTemplate99_N: | +99_N: | , (SELECT TOP 1 PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT TOP 1 PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)) pctile - -strTemplate99_else: ' ' - -strTemplate100_sampling: ' ' +99_else: ' ' diff --git a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query.sql similarity index 87% rename from testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql rename to testgen/template/flavors/mssql/profiling/project_secondary_profiling_query.sql index 54505605..4a52c3dc 100644 --- a/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query_mssql.sql +++ b/testgen/template/flavors/mssql/profiling/project_secondary_profiling_query.sql @@ -1,13 +1,17 @@ +WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) +-- TG-ENDIF + WITH (NOLOCK) + ), -- Get Freqs for selected columns -WITH ranked_vals +ranked_vals AS (SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC) AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" --- TG-IF do_sample_bool - TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) --- TG-ENDIF + FROM target_table WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" ), @@ -31,8 +35,7 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(STRING_AGG(CONVERT(NVARCHAR(max), val), '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHAR(10)) AS top_freq_values, (SELECT CONVERT(VARCHAR(40), HASHBYTES('MD5', STRING_AGG( NULLIF(dist_col_name,''), '|') WITHIN GROUP (ORDER BY dist_col_name)), 2) as dvh - FROM (SELECT DISTINCT "{COL_NAME}" as dist_col_name - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") a + FROM (SELECT DISTINCT "{COL_NAME}" as dist_col_name FROM target_table) a ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql b/testgen/template/flavors/postgresql/data_chars/get_schema_ddf.sql similarity index 76% rename from testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql rename to testgen/template/flavors/postgresql/data_chars/get_schema_ddf.sql index aca74a15..b5fcc322 100644 --- 
a/testgen/template/flavors/postgresql/data_chars/schema_ddf_query_postgresql.sql +++ b/testgen/template/flavors/postgresql/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' as project_code, - CURRENT_TIMESTAMP AT TIME ZONE 'UTC' as refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -17,16 +16,14 @@ SELECT '{PROJECT_CODE}' as project_code, END AS column_type, CASE WHEN c.data_type ILIKE 'char%' OR c.data_type ILIKE 'bit%' - THEN c.data_type || '(' || CAST(c.character_maximum_length AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.character_maximum_length AS VARCHAR) || ')', '') WHEN c.data_type = 'numeric' - THEN 'numeric' || COALESCE( '(' || CAST(c.numeric_precision AS VARCHAR) || ',' + THEN 'numeric' || COALESCE('(' || CAST(c.numeric_precision AS VARCHAR) || ',' || CAST(c.numeric_scale AS VARCHAR) || ')', '') WHEN c.data_type ILIKE 'time%' - THEN c.data_type || '(' || CAST(c.datetime_precision AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.datetime_precision AS VARCHAR) || ')', '') ELSE c.data_type END AS db_data_type, - COALESCE(c.character_maximum_length, CASE WHEN c.data_type IN ('text', 'character varying') THEN 65535 END) - as character_maximum_length, c.ordinal_position, CASE WHEN c.data_type ILIKE '%char%' or c.data_type = 'text' @@ -46,7 +43,10 @@ SELECT '{PROJECT_CODE}' as project_code, CASE WHEN c.data_type = 'numeric' THEN COALESCE(numeric_scale, 1) > 0 ELSE numeric_scale > 0 - END as is_decimal + END as is_decimal, + NULLIF(p.reltuples::BIGINT, -1) AS approx_record_ct FROM information_schema.columns c + LEFT JOIN pg_namespace n ON c.table_schema = n.nspname + LEFT JOIN pg_class p ON n.oid = p.relnamespace AND c.table_name = p.relname WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position diff --git a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql index 31b99ee1..6088cd63 100644 --- a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql +++ b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' 
END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM "{SCHEMA_NAME}"."{TABLE_NAME}" diff --git a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql index eda6d933..4cf4faf2 100644 --- a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql +++ b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql @@ -3,8 +3,6 @@ SELECT '{TEST_TYPE}' as test_type, '{TEST_SUITE_ID}' as test_suite_id, '{TEST_RUN_ID}' as test_run_id, '{RUN_DATE}' as test_time, - '{START_TIME}' as starttime, - CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME_NO_QUOTES}' as column_names, @@ -27,9 +25,7 @@ SELECT '{TEST_TYPE}' as test_type, ) ELSE 'No errors found.' END AS result_message, - COUNT(*) as result_measure, - '{SUBSET_DISPLAY}' as subset_condition, - NULL as result_query + COUNT(*) as result_measure FROM ( ( SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} diff --git a/testgen/template/flavors/postgresql/gen_query_tests/gen_table_changed_test.sql b/testgen/template/flavors/postgresql/gen_query_tests/gen_table_changed_test.sql deleted file mode 100644 index fd3fe0a1..00000000 --- a/testgen/template/flavors/postgresql/gen_query_tests/gen_table_changed_test.sql +++ /dev/null @@ -1,157 +0,0 @@ -INSERT INTO test_definitions (table_groups_id, profile_run_id, test_type, test_suite_id, - schema_name, table_name, - skip_errors, test_active, last_auto_gen_date, profiling_as_of_date, - lock_refresh, history_calculation, history_lookback, custom_query ) -WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date - FROM profile_results p - INNER JOIN profiling_runs r - ON (p.profile_run_id = r.id) - INNER JOIN test_suites ts - ON p.project_code = ts.project_code - AND p.connection_id = ts.connection_id - WHERE p.project_code = '{PROJECT_CODE}' - AND r.table_groups_id = '{TABLE_GROUPS_ID}'::UUID - AND ts.id = '{TEST_SUITE_ID}' - AND p.run_date::DATE <= '{AS_OF_DATE}' - GROUP BY r.table_groups_id), -curprof AS (SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, - distinct_value_ct, record_ct, max_value, min_value, avg_value, stdev_value, null_value_ct - FROM last_run lr - INNER JOIN profile_results p - ON (lr.table_groups_id = p.table_groups_id - AND lr.last_run_date = p.run_date) ), -locked AS (SELECT schema_name, table_name - FROM test_definitions - WHERE table_groups_id = '{TABLE_GROUPS_ID}'::UUID - AND test_suite_id = '{TEST_SUITE_ID}' - AND test_type = 'Table_Freshness' - AND lock_refresh = 'Y'), --- IDs - TOP 2 -id_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, - distinct_value_ct, - ROW_NUMBER() OVER (PARTITION BY schema_name, table_name - ORDER BY - CASE - WHEN functional_data_type ILIKE 'ID-Unique%' THEN 1 - WHEN functional_data_type = 'ID-Secondary' THEN 2 - ELSE 3 - END, distinct_value_ct, column_name DESC) AS rank - FROM curprof - WHERE general_type IN ('A', 'D', 'N') - AND functional_data_type ILIKE 'ID%'), --- Process Date - TOP 1 -process_date_cols - AS (SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, - distinct_value_ct, - ROW_NUMBER() OVER 
(PARTITION BY schema_name, table_name - ORDER BY - CASE - WHEN column_name ILIKE '%mod%' THEN 1 - WHEN column_name ILIKE '%up%' THEN 1 - WHEN column_name ILIKE '%cr%' THEN 2 - WHEN column_name ILIKE '%in%' THEN 2 - END , distinct_value_ct DESC, column_name) AS rank - FROM curprof - WHERE general_type IN ('A', 'D', 'N') - AND functional_data_type ILIKE 'process%'), --- Transaction Date - TOP 1 -tran_date_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, - distinct_value_ct, - ROW_NUMBER() OVER (PARTITION BY schema_name, table_name - ORDER BY - distinct_value_ct DESC, column_name) AS rank - FROM curprof - WHERE general_type IN ('A', 'D', 'N') - AND functional_data_type ILIKE 'transactional date%' - OR functional_data_type ILIKE 'period%' - OR functional_data_type = 'timestamp' ), - --- Numeric Measures -numeric_cols - AS ( SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type, -/* - -- Subscores - distinct_value_ct * 1.0 / NULLIF(record_ct, 0) AS cardinality_score, - (max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS range_score, - LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2)) AS nontriviality_score, - stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1) AS variability_score, - 1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1)) AS null_penalty, -*/ - -- Weighted score - ( - 0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) + - 0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + - 0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) + - 0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + - 0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1))) - ) AS change_detection_score - FROM curprof - WHERE general_type = 'N' - AND (functional_data_type ILIKE 'Measure%' OR functional_data_type IN ('Sequence', 'Constant')) - ), -numeric_cols_ranked - AS ( SELECT *, - ROW_NUMBER() OVER (PARTITION BY schema_name, table_name - ORDER BY change_detection_score DESC, column_name) as rank - FROM numeric_cols - WHERE change_detection_score IS NOT NULL), -combined - AS ( SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order - FROM id_cols - WHERE rank <= 2 - UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order - FROM process_date_cols - WHERE rank = 1 - UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order - FROM tran_date_cols - WHERE rank = 1 - UNION ALL - SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order - FROM numeric_cols_ranked - WHERE rank = 1 ), -newtests - AS (SELECT profile_run_id, schema_name, table_name, - 'COUNT(*)::VARCHAR || ''|'' || ' || - STRING_AGG( - REPLACE( - CASE - WHEN general_type = 'D' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || COUNT(DISTINCT @@@)::VARCHAR' - WHEN general_type = 'A' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || COUNT(DISTINCT @@@)::VARCHAR || ''|'' || SUM(LENGTH(@@@))::VARCHAR' - WHEN general_type = 'N' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || SUM(@@@)::VARCHAR || ''|'' || ROUND(AVG(@@@), 5)::VARCHAR || ''|'' || 
ROUND(STDDEV(@@@::FLOAT)::NUMERIC, 5)::VARCHAR' - END, - '@@@', '"' || column_name || '"'), - ' || ''|'' || ' - ORDER BY element_type, fingerprint_order, column_name) as fingerprint - FROM combined - GROUP BY profile_run_id, schema_name, table_name) -SELECT '{TABLE_GROUPS_ID}'::UUID as table_groups_id, - n.profile_run_id, - 'Table_Freshness' AS test_type, - '{TEST_SUITE_ID}' AS test_suite_id, - n.schema_name, n.table_name, - 0 as skip_errors, 'Y' as test_active, - - '{RUN_DATE}'::TIMESTAMP as last_auto_gen_date, - '{AS_OF_DATE}'::TIMESTAMP as profiling_as_of_date, - 'N' as lock_refresh, - 'Value' as history_calculation, - 1 as history_lookback, - fingerprint as custom_query -FROM newtests n -INNER JOIN test_types t - ON ('Table_Freshness' = t.test_type - AND 'Y' = t.active) -LEFT JOIN generation_sets s - ON (t.test_type = s.test_type - AND '{GENERATION_SET}' = s.generation_set) -LEFT JOIN locked l - ON (n.schema_name = l.schema_name - AND n.table_name = l.table_name) -WHERE (s.generation_set IS NOT NULL - OR '{GENERATION_SET}' = '') - AND l.schema_name IS NULL; - diff --git a/testgen/template/flavors/postgresql/profiling/project_get_table_sample_count_postgresql.sql b/testgen/template/flavors/postgresql/profiling/project_get_table_sample_count_postgresql.sql deleted file mode 100644 index 6939bae9..00000000 --- a/testgen/template/flavors/postgresql/profiling/project_get_table_sample_count_postgresql.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END::NUMERIC, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml similarity index 79% rename from testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml rename to testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml index 6bf6631f..67156d77 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml @@ -1,7 +1,15 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT +01_all: | {CONNECTION_ID} as connection_id, 
'{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +22,30 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -strTemplate03_ADN: MIN(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS min_length, + +03_ADN: MIN(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS min_length, MAX(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS max_length, AVG(NULLIF(LENGTH(CAST("{COL_NAME}" AS TEXT)), 0)::FLOAT) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM(CASE + +04_A: SUM(CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 ELSE 0 END) AS zero_value_ct, -strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}"::NUMERIC)) )::BIGINT AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, +04_N: SUM( 1 - ABS(SIGN("{COL_NAME}"::NUMERIC)) )::BIGINT AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, SUM(CASE WHEN "{COL_NAME}" = '' THEN 1 ELSE 0 @@ -96,7 +108,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -111,7 +123,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DESC) , 1000) AS concat_pats + +06_A: ( SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DESC) , 1000) AS concat_pats FROM ( SELECT CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct @@ -119,28 +132,17 @@ strTemplate06_A_patterns: ( SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DE "{COL_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC LIMIT 5 ) ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: 
MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}"::NUMERIC > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS avg_value, @@ -148,7 +150,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -156,11 +158,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') END as min_date, @@ -199,9 +201,7 @@ strTemplate11_D: CASE COUNT(DISTINCT <%DATEDIFF_DAY;"{COL_NAME}";'{RUN_DATE}'%>) as date_days_present, COUNT(DISTINCT <%DATEDIFF_WEEK;"{COL_NAME}";'{RUN_DATE}'%>) as date_weeks_present, COUNT(DISTINCT <%DATEDIFF_MONTH;"{COL_NAME}";'{RUN_DATE}'%>) as date_months_present, - - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -215,52 +215,36 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, - -strTemplate12_else: NULL as boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g')))::BIGINT) AS embedded_space_ct, AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g'))::FLOAT) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g')))::BIGINT) AS embedded_space_ct, - AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g'))::FLOAT) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, - -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64)' +98_all: ' FROM target_table ' -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' - -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50, PERCENTILE_CONT(0.75) 
WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile - -strTemplate99_else: ' ' - -strTemplate100_sampling: ' ' +99_else: ' ' diff --git a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query.sql similarity index 87% rename from testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql rename to testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query.sql index b9b0c3d6..86db6e47 100644 --- a/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query_postgresql.sql +++ b/testgen/template/flavors/postgresql/profiling/project_secondary_profiling_query.sql @@ -1,12 +1,15 @@ -- Get Freqs for selected columns -WITH ranked_vals AS ( - SELECT "{COL_NAME}", - COUNT(*) AS ct, - ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" -- TG-IF do_sample_bool TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) -- TG-ENDIF +), +ranked_vals AS ( + SELECT "{COL_NAME}", + COUNT(*) AS ct, + ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn + FROM target_table WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" ), @@ -27,5 +30,5 @@ SELECT '{PROJECT_CODE}' as project_code, '{COL_NAME}' as column_name, REPLACE(STRING_AGG(val, '^#^' ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(STRING_AGG(DISTINCT "{COL_NAME}", '|' ORDER BY "{COL_NAME}")) as dvh - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash + FROM target_table ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql b/testgen/template/flavors/redshift/data_chars/get_schema_ddf.sql similarity index 76% rename from testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql rename to testgen/template/flavors/redshift/data_chars/get_schema_ddf.sql index cf61e7ca..6bda34ec 100644 --- a/testgen/template/flavors/redshift/data_chars/schema_ddf_query_redshift.sql +++ b/testgen/template/flavors/redshift/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' as project_code, - CURRENT_TIMESTAMP AT TIME ZONE 'UTC' as refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -14,13 +13,12 @@ SELECT '{PROJECT_CODE}' as project_code, ELSE c.data_type END AS column_type, CASE WHEN c.data_type ILIKE 'char%' - THEN c.data_type || '(' || CAST(c.character_maximum_length AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.character_maximum_length AS VARCHAR) || ')', '') WHEN c.data_type = 'numeric' - THEN 'numeric' || COALESCE( '(' || CAST(c.numeric_precision AS VARCHAR) || ',' + THEN 'numeric' || COALESCE('(' || CAST(c.numeric_precision AS VARCHAR) || ',' || CAST(c.numeric_scale AS VARCHAR) || ')', '') ELSE c.data_type END AS db_data_type, - c.character_maximum_length, 
c.ordinal_position, CASE WHEN c.data_type ILIKE 'char%' @@ -40,7 +38,13 @@ SELECT '{PROJECT_CODE}' as project_code, CASE WHEN c.data_type = 'numeric' THEN COALESCE(numeric_scale, 1) > 0 ELSE numeric_scale > 0 - END AS is_decimal + END AS is_decimal, + CASE + WHEN reltuples > 0 AND reltuples < 1 THEN NULL + ELSE reltuples::BIGINT + END AS approx_record_ct FROM information_schema.columns c + LEFT JOIN pg_namespace n ON c.table_schema = n.nspname + LEFT JOIN pg_class p ON n.oid = p.relnamespace AND c.table_name = p.relname WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position diff --git a/testgen/template/flavors/redshift/profiling/project_get_table_sample_count_redshift.sql b/testgen/template/flavors/redshift/profiling/project_get_table_sample_count_redshift.sql deleted file mode 100644 index 9a62c3d6..00000000 --- a/testgen/template/flavors/redshift/profiling/project_get_table_sample_count_redshift.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml similarity index 76% rename from testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml rename to testgen/template/flavors/redshift/profiling/project_profiling_query.yaml index 8ee6eed3..1055ecd1 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml @@ -1,7 +1,15 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,24 +22,28 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, 
SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, -strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, + +04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, +04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, COUNT( CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, COUNT( CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 END ) AS lead_space_ct, COUNT( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct, @@ -76,7 +88,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -91,33 +103,23 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + +06_A: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, @@ -125,7 +127,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: 
NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -133,11 +135,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') END as min_date, @@ -153,8 +155,7 @@ strTemplate11_D: CASE COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -168,52 +169,36 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate12_else: NULL as boolean_true_ct, - -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, - AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, - -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' +98_all: ' FROM target_table' -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' - -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_else: ' ' - -strTemplate100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}' +99_else: ' ' diff --git 
a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query.sql similarity index 84% rename from testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql rename to testgen/template/flavors/redshift/profiling/project_secondary_profiling_query.sql index 58b86519..794275c8 100644 --- a/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query_redshift.sql +++ b/testgen/template/flavors/redshift/profiling/project_secondary_profiling_query.sql @@ -1,13 +1,16 @@ +WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} +-- TG-ENDIF +), -- Get Freqs for selected columns -WITH ranked_vals AS ( +ranked_vals AS ( SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' --- TG-IF do_sample_bool - AND RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} --- TG-ENDIF GROUP BY "{COL_NAME}" ), consol_vals AS ( @@ -28,5 +31,5 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|') WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash + FROM target_table ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/data_chars/get_schema_ddf.sql similarity index 76% rename from testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql rename to testgen/template/flavors/redshift_spectrum/data_chars/get_schema_ddf.sql index 76ded622..3a6669f3 100644 --- a/testgen/template/flavors/redshift_spectrum/data_chars/schema_ddf_query_redshift_spectrum.sql +++ b/testgen/template/flavors/redshift_spectrum/data_chars/get_schema_ddf.sql @@ -1,14 +1,9 @@ -SELECT '{PROJECT_CODE}' AS project_code, - CURRENT_TIMESTAMP AT TIME ZONE 'UTC' AS refresh_timestamp, - c.schemaname AS table_schema, +SELECT + c.schemaname AS schema_name, c.tablename AS table_name, c.columnname AS column_name, c.external_type AS column_type, c.external_type AS db_data_type, - NULLIF( - REGEXP_SUBSTR(c.external_type, 'char\\(([0-9]+)\\)', 1, 1, 'e'), - '' - ) AS character_maximum_length, c.columnnum AS ordinal_position, CASE WHEN c.external_type = 'string' @@ -29,7 +24,8 @@ SELECT '{PROJECT_CODE}' AS project_code, WHEN REGEXP_SUBSTR(c.external_type, 'decimal\\([0-9]+,([0-9]+)\\)', 1, 1, 'e') > 0 THEN 1 ELSE 0 - END AS is_decimal + END AS is_decimal, + NULL AS approx_record_ct -- Table statistics unavailable FROM svv_external_columns c WHERE c.schemaname = '{DATA_SCHEMA}' {TABLE_CRITERIA} diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql deleted file mode 100644 index 9a62c3d6..00000000 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_get_table_sample_count_redshift_spectrum.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as 
calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml similarity index 76% rename from testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml rename to testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml index 80b7a583..0e0b6401 100644 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query_redshift_spectrum.yaml +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml @@ -1,7 +1,15 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,24 +22,28 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, -strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, + +04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, +04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, COUNT( CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, COUNT( CASE WHEN "{COL_NAME}" BETWEEN ' 
!' AND '!' THEN 1 END ) AS lead_space_ct, COUNT( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct, @@ -76,7 +88,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -91,33 +103,23 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + +06_A: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, @@ -125,7 +127,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -133,11 +135,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: CASE +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') END as min_date, @@ -153,8 +155,7 @@ strTemplate11_D: CASE COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -168,52 +169,36 @@ 
strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate12_else: NULL as boolean_true_ct, - -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, - AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, - -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ' +98_all: ' FROM target_table' -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' - -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_else: ' ' - -strTemplate100_sampling: 'WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO}' +99_else: ' ' diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query.sql similarity index 84% rename from testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql rename to testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query.sql index 58b86519..794275c8 100644 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query_redshift_spectrum.sql +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_secondary_profiling_query.sql @@ -1,13 +1,16 @@ +WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} +-- TG-ENDIF +), -- Get Freqs for selected columns -WITH ranked_vals AS ( +ranked_vals AS ( SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' --- TG-IF do_sample_bool - AND RAND() 
<= 1.0 / {PROFILE_SAMPLE_RATIO} --- TG-ENDIF GROUP BY "{COL_NAME}" ), consol_vals AS ( @@ -28,5 +31,5 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(LISTAGG(DISTINCT "{COL_NAME}", '|') WITHIN GROUP (ORDER BY "{COL_NAME}")) as dvh - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash + FROM target_table ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql b/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql deleted file mode 100644 index 83cc6091..00000000 --- a/testgen/template/flavors/redshift_spectrum/validate_tests/ex_get_project_column_list.sql +++ /dev/null @@ -1,3 +0,0 @@ -select concat(concat(concat(schemaname, '.'), concat(tablename, '.')), columnname) as columns -from svv_external_columns -where schemaname in ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/redshift_spectrum/validate_tests/get_target_identifiers.sql b/testgen/template/flavors/redshift_spectrum/validate_tests/get_target_identifiers.sql new file mode 100644 index 00000000..ebd4ca82 --- /dev/null +++ b/testgen/template/flavors/redshift_spectrum/validate_tests/get_target_identifiers.sql @@ -0,0 +1,5 @@ +SELECT schemaname AS schema_name, + tablename AS table_name, + columnname AS column_name +FROM svv_external_columns +WHERE schemaname IN ({TEST_SCHEMAS}); diff --git a/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql b/testgen/template/flavors/snowflake/data_chars/get_schema_ddf.sql similarity index 73% rename from testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql rename to testgen/template/flavors/snowflake/data_chars/get_schema_ddf.sql index 6e90f897..54940da8 100644 --- a/testgen/template/flavors/snowflake/data_chars/schema_ddf_query_snowflake.sql +++ b/testgen/template/flavors/snowflake/data_chars/get_schema_ddf.sql @@ -1,6 +1,5 @@ -SELECT '{PROJECT_CODE}' as project_code, - CURRENT_TIMESTAMP as refresh_timestamp, - c.table_schema, +SELECT + c.table_schema AS schema_name, c.table_name, c.column_name, CASE @@ -17,15 +16,14 @@ SELECT '{PROJECT_CODE}' as project_code, END AS column_type, CASE WHEN c.data_type = 'TEXT' - THEN 'VARCHAR(' || CAST(c.character_maximum_length AS VARCHAR) || ')' + THEN 'VARCHAR' || COALESCE('(' || CAST(c.character_maximum_length AS VARCHAR) || ')', '') WHEN c.data_type = 'NUMBER' - THEN c.data_type || '(' || CAST(c.numeric_precision AS VARCHAR) || ',' - || CAST(c.numeric_scale AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.numeric_precision AS VARCHAR) || ',' + || CAST(c.numeric_scale AS VARCHAR) || ')', '') WHEN c.data_type ILIKE 'TIME%' - THEN c.data_type || '(' || CAST(c.datetime_precision AS VARCHAR) || ')' + THEN c.data_type || COALESCE('(' || CAST(c.datetime_precision AS VARCHAR) || ')', '') ELSE c.data_type END AS db_data_type, - c.character_maximum_length, c.ordinal_position, CASE WHEN c.data_type = 'TEXT' @@ -43,7 +41,9 @@ SELECT '{PROJECT_CODE}' as project_code, ELSE 'X' END AS general_type, - numeric_scale > 0 AS is_decimal + numeric_scale > 0 AS is_decimal, + t.row_count AS approx_record_ct FROM information_schema.columns c + LEFT JOIN information_schema.tables t ON c.table_schema = t.table_schema AND c.table_name = t.table_name WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position; diff --git 
a/testgen/template/flavors/snowflake/profiling/project_get_table_sample_count_snowflake.sql b/testgen/template/flavors/snowflake/profiling/project_get_table_sample_count_snowflake.sql deleted file mode 100644 index 9a62c3d6..00000000 --- a/testgen/template/flavors/snowflake/profiling/project_get_table_sample_count_snowflake.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::FLOAT as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as FLOAT) * CAST(COUNT(*) as FLOAT) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as FLOAT) as min_sample_ct, - CAST(999000 as FLOAT) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml similarity index 78% rename from testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml rename to testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml index a42e3e29..5c04fce8 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml @@ -1,7 +1,15 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +22,30 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: COUNT(CASE + +04_A: COUNT(CASE WHEN REGEXP_LIKE(TRIM("{COL_NAME}"::VARCHAR), '^0(\.0*)?$') THEN 1 END) AS zero_value_ct, -strTemplate04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT 
UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, +04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, COUNT(CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, @@ -83,7 +95,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -98,7 +110,8 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + +06_A: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct @@ -106,26 +119,15 @@ strTemplate06_A_patterns: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (OR "{COL_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC) as ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT LEFT(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) as concat_vals - FROM ( - SELECT TOP 10 CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, - COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, @@ -133,7 +135,7 @@ strTemplate08_N: MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -141,11 +143,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, -strTemplate11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date, +11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date, MAX("{COL_NAME}") as max_date, COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, 
@@ -158,8 +160,7 @@ strTemplate11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date, COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -173,54 +174,38 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate12_else: NULL as boolean_true_ct, - -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct, AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct, - AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, - -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id " +16_all: " '{PROFILE_RUN_ID}' as profile_run_id " -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows)' +98_all: ' FROM target_table ' -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' - -strTemplate99_N: | +99_N: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) LIMIT 1 ) pctile - -strTemplate99_else: ; - -strTemplate100_sampling: ' ' +99_else: ; diff --git a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query.sql similarity index 88% rename from testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql rename to testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query.sql index 7b80fc70..2c4264dc 100644 --- a/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query_snowflake.sql +++ 
b/testgen/template/flavors/snowflake/profiling/project_secondary_profiling_query.sql @@ -1,12 +1,17 @@ +WITH target_table +AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + SAMPLE ({SAMPLE_SIZE} rows) +-- TG-ENDIF +), -- Get Freqs for selected columns -WITH ranked_vals AS ( +ranked_vals +AS ( SELECT "{COL_NAME}", COUNT(*) AS ct, ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" --- TG-IF do_sample_bool - SAMPLE ({SAMPLE_SIZE} rows) --- TG-ENDIF + FROM target_table WHERE "{COL_NAME}" > ' ' GROUP BY "{COL_NAME}" ), @@ -28,5 +33,5 @@ SELECT '{PROJECT_CODE}' as project_code, REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, ( SELECT MD5(LISTAGG(DISTINCT NULLIF("{COL_NAME}", ''), '|') WITHIN GROUP (ORDER BY NULLIF("{COL_NAME}", ''))) as dvh - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" ) as distinct_value_hash + FROM target_table ) as distinct_value_hash FROM consol_vals; diff --git a/testgen/template/flavors/trino/profiling/project_get_table_sample_count_trino.sql b/testgen/template/flavors/trino/profiling/project_get_table_sample_count_trino.sql deleted file mode 100644 index 23f5a4bf..00000000 --- a/testgen/template/flavors/trino/profiling/project_get_table_sample_count_trino.sql +++ /dev/null @@ -1,23 +0,0 @@ -WITH stats - AS (SELECT COUNT(*)::REAL as record_ct, - ROUND(CAST({PROFILE_SAMPLE_PERCENT} as REAL) * CAST(COUNT(*) as REAL) / 100.0) as calc_sample_ct, - CAST({PROFILE_SAMPLE_MIN_COUNT} as REAL) as min_sample_ct, - CAST(999000 as REAL) as max_sample_ct - FROM {SAMPLING_TABLE} ) -SELECT '{SAMPLING_TABLE}' as schema_table, - CASE WHEN record_ct <= min_sample_ct THEN -1 - WHEN calc_sample_ct > max_sample_ct THEN max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN calc_sample_ct - ELSE {PROFILE_SAMPLE_MIN_COUNT} - END as sample_count, - CASE WHEN record_ct <= min_sample_ct THEN 1 - WHEN calc_sample_ct > max_sample_ct THEN record_ct / max_sample_ct - WHEN calc_sample_ct > min_sample_ct THEN record_ct / calc_sample_ct - ELSE record_ct / min_sample_ct - END as sample_ratio, - ROUND(CASE WHEN record_ct <= min_sample_ct THEN 100 - WHEN calc_sample_ct > max_sample_ct THEN 100.0 * max_sample_ct / record_ct - WHEN calc_sample_ct > min_sample_ct THEN 100.0 * calc_sample_ct / record_ct - ELSE 100.0 * min_sample_ct / record_ct - END, 4) as sample_percent_calc - FROM stats; diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query.yaml similarity index 80% rename from testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml rename to testgen/template/flavors/trino/profiling/project_profiling_query.yaml index 313f79bd..3346003d 100644 --- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml +++ b/testgen/template/flavors/trino/profiling/project_profiling_query.yaml @@ -1,7 +1,15 @@ --- -strTemplate01_sampling: "SELECT " -strTemplate01_else: "SELECT " -strTemplate01_5: | +01_sampling: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC}) + ) + SELECT +01_else: | + WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + ) + SELECT +01_all: | {CONNECTION_ID} as connection_id, '{PROJECT_CODE}' as project_code, '{TABLE_GROUPS_ID}' as table_groups_id, @@ -14,26 +22,29 @@ strTemplate01_5: | '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS 
record_ct, -strTemplate02_X: | + +02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate02_else: | +02_else: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -strTemplate03_ADN: MIN(LENGTH("{COL_NAME}")) AS min_length, + +03_ADN: MIN(LENGTH("{COL_NAME}")) AS min_length, MAX(LENGTH("{COL_NAME}")) AS max_length, AVG(CAST(NULLIF(LENGTH("{COL_NAME}"), 0) AS REAL)) AS avg_length, -strTemplate03_else: NULL as min_length, +03_else: NULL as min_length, NULL as max_length, NULL as avg_length, -strTemplate04_A: SUM(CASE +04_A: SUM(CASE WHEN REGEXP_LIKE(TRIM("{COL_NAME}") , '^0(\.0*)?$') = TRUE THEN 1 ELSE 0 END) AS zero_value_ct, -strTemplate04_N: CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")) ) AS BIGINT) AS zero_value_ct, -strTemplate04_else: NULL as zero_value_ct, -strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, +04_N: CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")) ) AS BIGINT) AS zero_value_ct, +04_else: NULL as zero_value_ct, + +05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, SUM(CASE WHEN "{COL_NAME}" = '' THEN 1 ELSE 0 @@ -97,7 +108,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a AND SUBSTRING("{COL_NAME}", 1, 3) NOT BETWEEN '734' AND '749' AND SUBSTRING("{COL_NAME}", 1, 3) <> '666' THEN 1 END) AS REAL)/CAST(COUNT("{COL_NAME}") AS REAL) > 0.9 THEN 'SSN' END as std_pattern_match, -strTemplate05_else: NULL as distinct_std_value_ct, +05_else: NULL as distinct_std_value_ct, NULL as zero_length_ct, NULL as lead_space_ct, NULL as quoted_value_ct, @@ -112,33 +123,23 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, -strTemplate06_A_patterns: (SELECT SUBSTRING(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) AS concat_pats + +06_A: (SELECT SUBSTRING(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) AS concat_pats FROM ( SELECT CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') AS pattern - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) - FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {PARM_MAX_PATTERN_LENGTH}) p + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p GROUP BY pattern HAVING pattern > ' ' ORDER BY COUNT(*) DESC LIMIT 5) as ps) AS top_patterns, -strTemplate06_else: NULL as top_patterns, -strTemplate07_A_freq: ( SELECT SUBSTRING(LISTAGG(val, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) as concat_vals - FROM ( - SELECT CAST(COUNT(*) as VARCHAR(10)) || ' | ' || "{COL_NAME}" as val, COUNT(*) as ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - WHERE "{COL_NAME}" > ' ' - GROUP BY "{COL_NAME}" - HAVING "{COL_NAME}" > ' ' - ORDER BY COUNT(*), "{COL_NAME}" DESC - LIMIT 10 - ) ps - ) AS top_freq_values, -strTemplate07_else: NULL as top_freq_values, -strTemplate08_N: MIN("{COL_NAME}") AS min_value, +06_else: NULL as top_patterns, + +08_N: MIN("{COL_NAME}") AS min_value, MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, MAX("{COL_NAME}") AS max_value, AVG(CAST("{COL_NAME}" AS REAL)) AS avg_value, @@ -146,7 +147,7 @@ strTemplate08_N: 
MIN("{COL_NAME}") AS min_value, MIN(pct_25) as percentile_25, MIN(pct_50) as percentile_50, MIN(pct_75) as percentile_75, -strTemplate08_else: NULL as min_value, +08_else: NULL as min_value, NULL as min_value_over_0, NULL as max_value, NULL as avg_value, @@ -154,10 +155,11 @@ strTemplate08_else: NULL as min_value, NULL as percentile_25, NULL as percentile_50, NULL as percentile_75, -strTemplate10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -strTemplate10_else: NULL as fractional_sum, -strTemplate11_D: CASE +10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, +10_else: NULL as fractional_sum, + +11_D: CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') END as min_date, @@ -196,8 +198,7 @@ strTemplate11_D: CASE COUNT(DISTINCT DATE_DIFF('day', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATE_DIFF('week', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATE_DIFF('month', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_months_present, - -strTemplate11_else: NULL as min_date, +11_else: NULL as min_date, NULL as max_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, @@ -211,52 +212,36 @@ strTemplate11_else: NULL as min_date, NULL as date_weeks_present, NULL as date_months_present, -strTemplate12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +12_else: NULL as boolean_true_ct, -strTemplate12_else: NULL as boolean_true_ct, - -strTemplate13_ALL: NULL AS datatype_suggestion, -strTemplate14_A_do_patterns: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( +14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') ) AS pattern_ct - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')) AS BIGINT)) AS embedded_space_ct, AVG(CAST(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ') AS REAL)) AS avg_embedded_spaces, - -strTemplate14_A_no_patterns: NULL as distinct_pattern_ct, - SUM(CAST(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')) AS BIGINT)) AS embedded_space_ct, - AVG(CAST(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ') AS REAL)) AS avg_embedded_spaces, - -strTemplate14_else: NULL as distinct_pattern_ct, +14_else: NULL as distinct_pattern_ct, NULL as embedded_space_ct, NULL as avg_embedded_spaces, -strTemplate15_ALL: NULL as functional_data_type, - NULL as functional_table_type, - -strTemplate16_ALL: " '{PROFILE_RUN_ID}' as profile_run_id" +16_all: " '{PROFILE_RUN_ID}' as profile_run_id" -strTemplate98_sampling: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC})' +98_all: ' FROM target_table' -strTemplate98_else: ' FROM "{DATA_SCHEMA}"."{DATA_TABLE}"' - -strTemplate99_N: | +99_N: | , (SELECT APPROX_PERCENTILE("{COL_NAME}", 0.25) AS pct_25, APPROX_PERCENTILE("{COL_NAME}", 0.50) AS pct_50, APPROX_PERCENTILE("{COL_NAME}", 0.75) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile - -strTemplate99_N_sampling: | +99_N_sampling: | , (SELECT APPROX_PERCENTILE("{COL_NAME}", 0.25) AS pct_25, APPROX_PERCENTILE("{COL_NAME}", 0.50) AS pct_50, APPROX_PERCENTILE("{COL_NAME}", 0.75) AS pct_75 FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE SYSTEM ({SAMPLE_PERCENT_CALC}) ) pctile - -strTemplate99_else: ' ' - -strTemplate100_sampling: ' ' +99_else: ' 
' diff --git a/testgen/template/gen_query_tests/gen_table_changed_test.sql b/testgen/template/gen_query_tests/gen_table_changed_test.sql index 918af282..4c578f13 100644 --- a/testgen/template/gen_query_tests/gen_table_changed_test.sql +++ b/testgen/template/gen_query_tests/gen_table_changed_test.sql @@ -121,7 +121,12 @@ newtests CASE WHEN general_type = 'D' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || COUNT(DISTINCT @@@)::VARCHAR' WHEN general_type = 'A' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || COUNT(DISTINCT @@@)::VARCHAR || ''|'' || SUM(LENGTH(@@@))::VARCHAR' - WHEN general_type = 'N' THEN 'MIN(@@@)::VARCHAR || ''|'' || MAX(@@@::VARCHAR) || ''|'' || SUM(@@@)::VARCHAR || ''|'' || ROUND(AVG(@@@), 5)::VARCHAR || ''|'' || ROUND(STDDEV(@@@::FLOAT), 5)::VARCHAR' + WHEN general_type = 'N' THEN 'COUNT(@@@)::VARCHAR || ''|'' || + COUNT(DISTINCT MOD((COALESCE(@@@,0)::DECIMAL(38,6) * 1000000)::DECIMAL(38,0), 1000003))::VARCHAR || ''|'' || + COALESCE((MIN(@@@)::DECIMAL(38,6))::VARCHAR, '''') || ''|'' || + COALESCE((MAX(@@@)::DECIMAL(38,6))::VARCHAR, '''') || ''|'' || + COALESCE(MOD(COALESCE(SUM(MOD((ABS(COALESCE(@@@,0))::DECIMAL(38,6) * 1000000)::DECIMAL, 1000000007)), 0), 1000000007)::VARCHAR, '''') || ''|'' || + COALESCE(MOD(COALESCE(SUM(MOD((ABS(COALESCE(@@@,0))::DECIMAL(38,6) * 1000000)::DECIMAL, 1000000009)), 0), 1000000009)::VARCHAR, '''')' END, '@@@', '"' || column_name || '"'), ' || ''|'' || ' diff --git a/testgen/template/get_entities/get_test_info.sql b/testgen/template/get_entities/get_test_info.sql index b941cc23..142ddc63 100644 --- a/testgen/template/get_entities/get_test_info.sql +++ b/testgen/template/get_entities/get_test_info.sql @@ -4,7 +4,6 @@ Alternative: project-code, connection-id Optional: last_auto_run_date (==test-gen-run-id==), schema-name, table-name, column-name*/ SELECT ts.project_code as project_key, - td.cat_test_id, ts.test_suite as test_suite_key, td.test_type, COALESCE(td.test_description, tt.test_description) as test_description, diff --git a/testgen/template/get_entities/get_test_suite.sql b/testgen/template/get_entities/get_test_suite.sql index b602768d..fdbd9638 100644 --- a/testgen/template/get_entities/get_test_suite.sql +++ b/testgen/template/get_entities/get_test_suite.sql @@ -4,8 +4,6 @@ SELECT test_suite as test_suite_key, connection_id, test_suite_description, - test_action as default_test_action, - test_suite_schema, component_key, component_type FROM test_suites diff --git a/testgen/template/observability/get_test_results.sql b/testgen/template/observability/get_test_results.sql index 85ab567a..077f8724 100644 --- a/testgen/template/observability/get_test_results.sql +++ b/testgen/template/observability/get_test_results.sql @@ -1,7 +1,6 @@ SELECT project_name, component_tool, - "schema", connection_name, project_db, sample_min_count, diff --git a/testgen/template/parms/parms_profiling.sql b/testgen/template/parms/parms_profiling.sql deleted file mode 100644 index 7b98d41f..00000000 --- a/testgen/template/parms/parms_profiling.sql +++ /dev/null @@ -1,28 +0,0 @@ -SELECT tg.project_code, - tg.id::VARCHAR(50) as table_groups_id, - tg.table_group_schema, - tg.table_group_schema, - CASE - WHEN tg.profiling_table_set ILIKE '''%''' THEN tg.profiling_table_set - ELSE fn_format_csv_quotes(tg.profiling_table_set) - END as profiling_table_set, - tg.profiling_include_mask, - tg.profiling_exclude_mask, - tg.profile_id_column_mask, - tg.profile_sk_column_mask, - tg.profile_use_sampling, - tg.profile_flag_cdes, - 
tg.profile_sample_percent, - tg.profile_sample_min_count, - tg.profile_do_pair_rules, - tg.profile_pair_rule_pct, - CASE - WHEN tg.monitor_test_suite_id IS NULL THEN NULL - ELSE tg.monitor_test_suite_id::VARCHAR(50) - END as monitor_test_suite_id, - CASE - WHEN tg.last_complete_profile_run_id is NULL THEN NULL - ELSE tg.last_complete_profile_run_id::VARCHAR(50) - END as last_complete_profile_run_id - FROM table_groups tg - WHERE tg.id = :TABLE_GROUP_ID; diff --git a/testgen/template/parms/parms_test_execution.sql b/testgen/template/parms/parms_test_execution.sql deleted file mode 100644 index f81b0c2f..00000000 --- a/testgen/template/parms/parms_test_execution.sql +++ /dev/null @@ -1,14 +0,0 @@ -SELECT ts.project_code, - ts.id::VARCHAR as test_suite_id, - ts.table_groups_id::VARCHAR, - tg.table_group_schema, - CASE - WHEN tg.profiling_table_set ILIKE '''%''' THEN tg.profiling_table_set - ELSE fn_format_csv_quotes(tg.profiling_table_set) - END as profiling_table_set, - tg.profiling_include_mask, - tg.profiling_exclude_mask - FROM test_suites ts - JOIN table_groups tg ON (ts.table_groups_id = tg.id) - WHERE ts.project_code = :PROJECT_CODE - AND ts.test_suite = :TEST_SUITE; diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index af610dbe..e3c66599 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -590,4 +590,30 @@ WHERE profile_run_id = :PROFILE_RUN_ID AND (TRIM(SPLIT_PART(top_patterns, '|', 4)) ~ '^N{1,3}(\.N+)?%$' OR distinct_pattern_ct < 2) AND (TRIM(SPLIT_PART(top_patterns, '|', 6)) ~ '^N{1,3}(\.N+)?%$' OR distinct_pattern_ct < 3); +--- Update column characteristics --- + +WITH new_chars AS ( + SELECT table_groups_id, + schema_name, + table_name, + column_name, + general_type, + functional_data_type + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID + AND profile_run_id = :PROFILE_RUN_ID +) +UPDATE data_column_chars +SET general_type = n.general_type, + functional_data_type = COALESCE(n.functional_data_type, d.functional_data_type) +FROM new_chars n + INNER JOIN data_column_chars d ON ( + n.table_groups_id = d.table_groups_id + AND n.schema_name = d.schema_name + AND n.table_name = d.table_name + AND n.column_name = d.column_name + ) +WHERE data_column_chars.table_id = d.table_id + AND data_column_chars.column_name = d.column_name; + --- END OF QUERY --- diff --git a/testgen/template/profiling/functional_tabletype_update.sql b/testgen/template/profiling/functional_tabletype_update.sql index 3ae8c595..407bffbb 100644 --- a/testgen/template/profiling/functional_tabletype_update.sql +++ b/testgen/template/profiling/functional_tabletype_update.sql @@ -6,3 +6,27 @@ WHERE s.project_code = profile_results.project_code AND s.table_name = profile_results.table_name AND s.run_date = profile_results.run_date AND s.run_date = :RUN_DATE; + +--- Update table characteristics --- + +WITH new_chars AS ( + SELECT table_groups_id, + schema_name, + table_name, + functional_table_type + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID + GROUP BY table_groups_id, + schema_name, + table_name, + functional_table_type +) +UPDATE data_table_chars +SET functional_table_type = COALESCE(n.functional_table_type, d.functional_table_type) +FROM new_chars n + INNER JOIN data_table_chars d ON ( + n.table_groups_id = d.table_groups_id + AND n.schema_name = d.schema_name + AND n.table_name = d.table_name + ) +WHERE data_table_chars.table_id = d.table_id; 
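Both statements introduced above, the "Update column characteristics" block in functional_datatype.sql and the "Update table characteristics" block in functional_tabletype_update.sql, follow the same PostgreSQL UPDATE ... FROM shape: a CTE collects the freshly profiled values, the target table is joined a second time under an alias so the new value can be COALESCEd with the one currently stored, and the outer WHERE ties the updated row back to the matched pair. A minimal sketch of the pattern, using hypothetical table and column names rather than the real TestGen schema:

WITH new_vals AS (
    -- freshly profiled values for one run (hypothetical source table)
    SELECT table_name, column_name, functional_data_type
      FROM profile_results_example
     WHERE profile_run_id = :PROFILE_RUN_ID
)
UPDATE column_chars_example
   SET functional_data_type = COALESCE(n.functional_data_type, d.functional_data_type)
  FROM new_vals n
       -- join the target table again (alias d) to read the currently stored value
       INNER JOIN column_chars_example d
               ON n.table_name  = d.table_name
              AND n.column_name = d.column_name
 WHERE column_chars_example.table_name  = d.table_name
   AND column_chars_example.column_name = d.column_name;

The COALESCE keeps the previously stored characteristic whenever the new profile run produced NULL for it, which matches how the two new statements in this diff preserve existing functional_data_type and functional_table_type values.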
diff --git a/testgen/template/profiling/project_profile_run_record_insert.sql b/testgen/template/profiling/project_profile_run_record_insert.sql deleted file mode 100644 index e1c379fc..00000000 --- a/testgen/template/profiling/project_profile_run_record_insert.sql +++ /dev/null @@ -1,8 +0,0 @@ -INSERT INTO profiling_runs (id, project_code, connection_id, table_groups_id, profiling_starttime, process_id) -(SELECT :PROFILE_RUN_ID as id, - :PROJECT_CODE as project_code, - :CONNECTION_ID as connection_id, - :TABLE_GROUPS_ID as table_groups_id, - :RUN_DATE as profiling_starttime, - :PROCESS_ID as process_id - ); diff --git a/testgen/template/profiling/project_profile_run_record_update.sql b/testgen/template/profiling/project_profile_run_record_update.sql deleted file mode 100644 index e6c7b0de..00000000 --- a/testgen/template/profiling/project_profile_run_record_update.sql +++ /dev/null @@ -1,5 +0,0 @@ -UPDATE profiling_runs -SET status = CASE WHEN length(:EXCEPTION_MESSAGE) = 0 then 'Complete' else 'Error' end, - profiling_endtime = :NOW_TIMESTAMP, - log_message = :EXCEPTION_MESSAGE -where id = :PROFILE_RUN_ID; diff --git a/testgen/template/profiling/project_update_profile_results_to_estimates.sql b/testgen/template/profiling/project_update_profile_results_to_estimates.sql index e5a8741f..7e4d3535 100644 --- a/testgen/template/profiling/project_update_profile_results_to_estimates.sql +++ b/testgen/template/profiling/project_update_profile_results_to_estimates.sql @@ -24,8 +24,8 @@ set sample_ratio = :PROFILE_SAMPLE_RATIO, future_date_ct = ROUND(future_date_ct * :PROFILE_SAMPLE_RATIO, 0), boolean_true_ct = ROUND(boolean_true_ct * :PROFILE_SAMPLE_RATIO, 0) where profile_run_id = :PROFILE_RUN_ID -and schema_name = TRIM(SPLIT_PART(:SAMPLING_TABLE, '.', 1), :QUOTE) -and table_name = TRIM(SPLIT_PART(:SAMPLING_TABLE, '.', 2), :QUOTE) +and schema_name = :DATA_SCHEMA +and table_name = :SAMPLING_TABLE and sample_ratio IS NULL; diff --git a/testgen/template/profiling/refresh_anomalies.sql b/testgen/template/profiling/refresh_anomalies.sql index 9159fbf5..b97f9ce3 100644 --- a/testgen/template/profiling/refresh_anomalies.sql +++ b/testgen/template/profiling/refresh_anomalies.sql @@ -1,32 +1,15 @@ -WITH anomalies +WITH stats AS ( SELECT profile_run_id, COUNT(*) as anomaly_ct, COUNT(DISTINCT schema_name || '.' || table_name) as anomaly_table_ct, COUNT(DISTINCT schema_name || '.' || table_name || '.' || column_name) as anomaly_column_ct FROM profile_anomaly_results WHERE profile_run_id = :PROFILE_RUN_ID - GROUP BY profile_run_id ), -profiles - AS ( SELECT r.id as profile_run_id, - COUNT(DISTINCT p.schema_name || '.' 
|| p.table_name) as table_ct, - COUNT(*) as column_ct - FROM profiling_runs r - INNER JOIN profile_results p - ON r.id = p.profile_run_id - WHERE r.id = :PROFILE_RUN_ID - GROUP BY r.id ), -stats - AS ( SELECT p.profile_run_id, table_ct, column_ct, - a.anomaly_ct, a.anomaly_table_ct, a.anomaly_column_ct - FROM profiles p - LEFT JOIN anomalies a - ON (p.profile_run_id = a.profile_run_id) ) + GROUP BY profile_run_id ) UPDATE profiling_runs - SET table_ct = stats.table_ct, - column_ct = stats.column_ct, - anomaly_ct = COALESCE(stats.anomaly_ct, 0), + SET anomaly_ct = COALESCE(stats.anomaly_ct, 0), anomaly_table_ct = COALESCE(stats.anomaly_table_ct, 0), anomaly_column_ct = COALESCE(stats.anomaly_column_ct, 0) FROM stats diff --git a/testgen/template/quick_start/initial_data_seeding.sql b/testgen/template/quick_start/initial_data_seeding.sql index 6d9e76ed..f47a161b 100644 --- a/testgen/template/quick_start/initial_data_seeding.sql +++ b/testgen/template/quick_start/initial_data_seeding.sql @@ -31,9 +31,10 @@ SELECT '0ea85e17-acbe-47fe-8394-9970725ad37d'::UUID as id, 15000 as profile_sample_min_count; INSERT INTO test_suites - (project_code, test_suite, connection_id, table_groups_id, test_suite_description, + (id, project_code, test_suite, connection_id, table_groups_id, test_suite_description, export_to_observability, component_key, component_type) -SELECT '{PROJECT_CODE}' as project_code, +SELECT '9df7489d-92b3-49f9-95ca-512160d7896f'::UUID as id, + '{PROJECT_CODE}' as project_code, '{TEST_SUITE}' as test_suite, 1 as connection_id, '0ea85e17-acbe-47fe-8394-9970725ad37d'::UUID as table_groups_id, diff --git a/testgen/template/execution/ex_calc_prevalence_test_results.sql b/testgen/template/rollup_scores/calc_prevalence_test_results.sql similarity index 96% rename from testgen/template/execution/ex_calc_prevalence_test_results.sql rename to testgen/template/rollup_scores/calc_prevalence_test_results.sql index 95e09b48..88fdb6fb 100644 --- a/testgen/template/execution/ex_calc_prevalence_test_results.sql +++ b/testgen/template/rollup_scores/calc_prevalence_test_results.sql @@ -14,7 +14,7 @@ UPDATE test_results INNER JOIN data_table_chars tc ON (r.table_groups_id = tc.table_groups_id AND r.table_name ILIKE tc.table_name) - WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID + WHERE r.test_run_id = '{RUN_ID}'::UUID AND test_results.id = r.id; -- PROFILED COLUMN TESTS: Update to calculated prevalence for all fails/warnings - result_code = 0 @@ -51,7 +51,7 @@ WITH result_calc LEFT JOIN data_table_chars tc ON (r.table_groups_id = tc.table_groups_id AND r.table_name ILIKE tc.table_name) - WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID + WHERE r.test_run_id = '{RUN_ID}'::UUID AND result_code = 0 AND r.result_measure IS NOT NULL AND tt.test_scope = 'column' @@ -79,7 +79,7 @@ WITH result_calc INNER JOIN data_table_chars tc ON (r.table_groups_id = tc.table_groups_id AND r.table_name ILIKE tc.table_name) - WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID + WHERE r.test_run_id = '{RUN_ID}'::UUID AND result_code = 0 AND r.result_measure IS NOT NULL AND tt.test_scope <> 'column' diff --git a/testgen/template/validate_tests/ex_disable_tests_test_definitions.sql b/testgen/template/validate_tests/ex_disable_tests_test_definitions.sql deleted file mode 100644 index 67478434..00000000 --- a/testgen/template/validate_tests/ex_disable_tests_test_definitions.sql +++ /dev/null @@ -1,4 +0,0 @@ -UPDATE test_definitions - SET test_active = 'N' - WHERE test_suite_id = :TEST_SUITE_ID - AND test_active = 'D'; diff --git 
a/testgen/template/validate_tests/ex_flag_tests_test_definitions.sql b/testgen/template/validate_tests/ex_flag_tests_test_definitions.sql deleted file mode 100644 index 5d0b5a58..00000000 --- a/testgen/template/validate_tests/ex_flag_tests_test_definitions.sql +++ /dev/null @@ -1,7 +0,0 @@ -/* -Mark Test inactive for Missing columns/tables with update status -*/ -UPDATE test_definitions -SET test_active = :FLAG, - test_definition_status = LEFT('Inactivated ' || :RUN_DATE || ': ' || CONCAT_WS('; ', substring(test_definition_status from 34), :MESSAGE), 200) -WHERE cat_test_id IN :CAT_TEST_IDS; diff --git a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql deleted file mode 100644 index f7a1474f..00000000 --- a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql +++ /dev/null @@ -1,98 +0,0 @@ - SELECT schema_name || '.' || table_name || '.' || column_name AS columns, - ARRAY_AGG(cat_test_id) as test_id_array - FROM ( - -- FROM: column_name - column scope (single column) - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'column' - UNION - -- FROM: column_name - referential scope (could be multiple columns) - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(column_name, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') as column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'referential' - AND t.test_type NOT LIKE 'Aggregate_%' - UNION - -- FROM: groupby_names - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(groupby_names, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') AS column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope IN ('column', 'referential', 'table') - UNION - -- FROM: window_date_column (referential) - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(window_date_column, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') as column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'referential' - UNION - -- FROM: match_column_names (referential) - SELECT cat_test_id, - match_schema_name AS schema_name, - match_table_name AS table_name, - TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(match_column_names, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') as column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'referential' - AND t.test_type NOT LIKE 'Aggregate_%' - UNION - -- FROM: match_groupby_names (referential) - SELECT cat_test_id, - match_schema_name AS schema_name, - match_table_name AS table_name, - 
TRIM(TRIM(UNNEST(ARRAY_REMOVE( - REGEXP_SPLIT_TO_ARRAY(match_groupby_names, ',(?=(?:[^"]*"[^"]*")*[^"]*$)'), - '' )), ' '), '{QUOTE}') as column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'referential' - UNION - SELECT cat_test_id, - schema_name AS schema_name, - table_name AS table_name, - '' AS column_name - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE test_suite_id = :TEST_SUITE_ID - AND COALESCE(test_active, 'Y') = 'Y' - AND t.test_scope = 'table' ) cols -GROUP BY columns; diff --git a/testgen/template/validate_tests/ex_prep_flag_tests_test_definitions.sql b/testgen/template/validate_tests/ex_prep_flag_tests_test_definitions.sql deleted file mode 100644 index d436a3ca..00000000 --- a/testgen/template/validate_tests/ex_prep_flag_tests_test_definitions.sql +++ /dev/null @@ -1,6 +0,0 @@ -/* -Clean the test definition status before it's set with missing tables / columns information -*/ -UPDATE test_definitions -SET test_definition_status = NULL -WHERE cat_test_id IN :CAT_TEST_IDS; diff --git a/testgen/template/validate_tests/ex_write_test_val_errors.sql b/testgen/template/validate_tests/ex_write_test_val_errors.sql deleted file mode 100644 index 318d76bf..00000000 --- a/testgen/template/validate_tests/ex_write_test_val_errors.sql +++ /dev/null @@ -1,30 +0,0 @@ -INSERT INTO test_results - ( test_suite_id, - test_type, - test_definition_id, - schema_name, - table_name, - column_names, - test_time, - test_run_id, - input_parameters, - result_code, - result_status, - result_message, - result_measure ) - SELECT :TEST_SUITE_ID, - test_type, - id, - schema_name, - table_name, - column_name, - :RUN_DATE as test_time, - :TEST_RUN_ID as test_run_id, - NULL as input_parameters, - NULL as result_code, - 'Error' as result_status, - test_definition_status AS result_message, - NULL as result_measure - FROM test_definitions - WHERE test_active = 'D' - AND test_suite_id = :TEST_SUITE_ID; diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css index 335858ad..7665ae48 100644 --- a/testgen/ui/components/frontend/css/shared.css +++ b/testgen/ui/components/frontend/css/shared.css @@ -157,7 +157,7 @@ body { } .table-row { - padding: 12px 0; + padding: 8px 0; } .table.hoverable .table-row:hover { @@ -168,10 +168,6 @@ body { border-bottom: var(--button-stroked-border); } -.table-row:last-child { - padding-bottom: 0; -} - .table-header { border-bottom: var(--button-stroked-border); padding: 0 0 8px 0; @@ -216,6 +212,12 @@ body { .text-capitalize { text-transform: capitalize; } + +.text-code { + font-family:'Courier New', Courier, monospace; + line-height: 1.5; + white-space: pre-wrap; +} /* */ /* Flex utilities */ @@ -638,6 +640,10 @@ code > .tg-icon:hover { border-radius: 4px; } +.border-radius-2 { + border-radius: 8px; +} + input::-ms-reveal, input::-ms-clear { display: none; diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js index 5d6aa7e5..011e425a 100644 --- a/testgen/ui/components/frontend/js/components/connection_form.js +++ b/testgen/ui/components/frontend/js/components/connection_form.js @@ -31,6 +31,7 @@ * @property {boolean} connect_by_url * @property {string?} url * @property {boolean} connect_by_key + * @property {boolean} connect_with_identity * @property {string?} 
private_key * @property {string?} private_key_passphrase * @property {string?} http_path @@ -105,7 +106,7 @@ const ConnectionForm = (props, saveButton) => { const connectionFlavor = van.state(connection?.sql_flavor_code); const connectionName = van.state(connection?.connection_name ?? ''); const connectionMaxThreads = van.state(connection?.max_threads ?? 4); - const connectionQueryChars = van.state(connection?.max_query_chars ?? 9000); + const connectionQueryChars = van.state(connection?.max_query_chars ?? 20000); const privateKeyFile = van.state(getValue(props.cachedPrivateKeyFile) ?? null); const serviceAccountKeyFile = van.state(getValue(props.cachedServiceAccountKeyFile) ?? null); @@ -126,10 +127,11 @@ const ConnectionForm = (props, saveButton) => { warehouse: connection?.warehouse ?? '', url: connection?.url ?? '', service_account_key: connection?.service_account_key ?? '', + connect_with_identity: connection?.connect_with_identity ?? false, sql_flavor_code: connectionFlavor.rawVal ?? '', connection_name: connectionName.rawVal ?? '', max_threads: connectionMaxThreads.rawVal ?? 4, - max_query_chars: connectionQueryChars.rawVal ?? 9000, + max_query_chars: connectionQueryChars.rawVal ?? 20000, }); const dynamicConnectionUrl = van.state(props.dynamicConnectionUrl?.rawVal ?? ''); @@ -335,7 +337,7 @@ const ConnectionForm = (props, saveButton) => { hint: 'Some tests are consolidated into queries for maximum performance. Default values should be retained unless test queries are failing.', value: connectionQueryChars.rawVal, min: 500, - max: 14000, + max: 50000, onChange: (value) => connectionQueryChars.val = value, }), ), @@ -550,7 +552,197 @@ const RedshiftSpectrumForm = RedshiftForm; const PostgresqlForm = RedshiftForm; -const AzureMSSQLForm = RedshiftForm; +const AzureMSSQLForm = ( + connection, + flavor, + onChange, + originalConnection, + dynamicConnectionUrl, +) => { + const isValid = van.state(true); + const connectByUrl = van.state(connection.rawVal.connect_by_url ?? false); + const connectionHost = van.state(connection.rawVal.project_host ?? ''); + const connectionPort = van.state(connection.rawVal.project_port || defaultPorts[flavor.flavor]); + const connectionDatabase = van.state(connection.rawVal.project_db ?? ''); + const connectionUsername = van.state(connection.rawVal.project_user ?? ''); + const connectionPassword = van.state(connection.rawVal?.project_pw_encrypted ?? ''); + const connectionUrl = van.state(connection.rawVal?.url ?? ''); + const connectWithIdentity = van.state(connection.rawVal?.connect_with_identity ?? ''); + + const validityPerField = {}; + + van.derive(() => { + onChange({ + project_host: connectionHost.val, + project_port: connectionPort.val, + project_db: connectionDatabase.val, + project_user: connectionUsername.val, + project_pw_encrypted: connectionPassword.val, + connect_by_url: connectByUrl.val, + url: connectByUrl.val ? connectionUrl.val : connectionUrl.rawVal, + connect_by_key: false, + connect_with_identity: connectWithIdentity.val, + }, isValid.val); + }); + + van.derive(() => { + const newUrlValue = (dynamicConnectionUrl.val ?? 
'').replace(extractPrefix(dynamicConnectionUrl.rawVal), ''); + if (!connectByUrl.rawVal) { + connectionUrl.val = newUrlValue; + } + }); + + return div( + {class: 'flex-column fx-gap-3 fx-flex'}, + div( + { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' }, + Caption({content: 'Server', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }), + RadioGroup({ + label: 'Connect by', + options: [ + { + label: 'Host', + value: false, + }, + { + label: 'URL', + value: true, + }, + ], + value: connectByUrl, + onChange: (value) => connectByUrl.val = value, + layout: 'inline', + }), + div( + { class: 'flex-row fx-gap-3 fx-flex' }, + Input({ + name: 'db_host', + label: 'Host', + value: connectionHost, + class: 'fx-flex', + disabled: connectByUrl, + onChange: (value, state) => { + connectionHost.val = value; + validityPerField['db_host'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + maxLength(250), + requiredIf(() => !connectByUrl.val), + ], + }), + Input({ + name: 'db_port', + label: 'Port', + value: connectionPort, + type: 'number', + disabled: connectByUrl, + onChange: (value, state) => { + connectionPort.val = value; + validityPerField['db_port'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + minLength(3), + maxLength(5), + requiredIf(() => !connectByUrl.val), + ], + }) + ), + Input({ + name: 'db_name', + label: 'Database', + value: connectionDatabase, + disabled: connectByUrl, + onChange: (value, state) => { + connectionDatabase.val = value; + validityPerField['db_name'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + maxLength(100), + requiredIf(() => !connectByUrl.val), + ], + }), + () => div( + { class: 'flex-row fx-gap-3 fx-align-stretch', style: 'position: relative;' }, + Input({ + label: 'URL', + value: connectionUrl, + class: 'fx-flex', + name: 'url_suffix', + prefix: span({ style: 'white-space: nowrap; color: var(--disabled-text-color)' }, extractPrefix(dynamicConnectionUrl.val)), + disabled: !connectByUrl.val, + onChange: (value, state) => { + connectionUrl.val = value; + validityPerField['url_suffix'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + requiredIf(() => connectByUrl.val), + ], + }), + ), + ), + + div( + { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' }, + Caption({content: 'Authentication', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }), + + RadioGroup({ + label: 'Connection Strategy', + options: [ + {label: 'Connect By Password', value: false}, + {label: 'Connect with Managed Identity', value: true}, + ], + value: connectWithIdentity, + onChange: (value) => connectWithIdentity.val = value, + layout: 'inline', + }), + + () => { + const _connectWithIdentity = connectWithIdentity.val; + if (_connectWithIdentity) { + return div( + {class: 'flex-row p-4 fx-justify-center text-secondary'}, + 'Microsoft Entra ID credentials configured on host machine will be used', + ); + } + + return div( + {class: 'flex-column fx-gap-1'}, + Input({ + name: 'db_user', + label: 'Username', + value: connectionUsername, + onChange: (value, state) => { + connectionUsername.val = value; + validityPerField['db_user'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + 
validators: [ + requiredIf(() => !connectWithIdentity.val), + maxLength(50), + ], + }), + Input({ + name: 'password', + label: 'Password', + value: connectionPassword, + type: 'password', + passwordSuggestions: false, + placeholder: (originalConnection?.connection_id && originalConnection?.project_pw_encrypted) ? secretsPlaceholder : '', + onChange: (value, state) => { + connectionPassword.val = value; + validityPerField['password'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + }), + ) + }, + ), + ); +}; const SynapseMSSQLForm = RedshiftForm; @@ -1110,11 +1302,19 @@ const BigqueryForm = ( }; function extractPrefix(url) { - const parts = (url ?? '').split('@'); - if (!parts[0]) { + if (!url) { return ''; } - return `${parts[0]}@`; + + if (url.includes('@')) { + const parts = url.split('@'); + if (!parts[0]) { + return ''; + } + return `${parts[0]}@`; + } + + return url.slice(0, url.indexOf('://') + 3); } function shouldRefreshUrl(previous, current) { @@ -1122,7 +1322,7 @@ function shouldRefreshUrl(previous, current) { return false; } - const fields = ['sql_flavor', 'project_host', 'project_port', 'project_db', 'project_user', 'connect_by_key', 'http_path', 'warehouse']; + const fields = ['sql_flavor', 'project_host', 'project_port', 'project_db', 'project_user', 'connect_by_key', 'http_path', 'warehouse', 'connect_with_identity']; return fields.some((fieldName) => previous[fieldName] !== current[fieldName]); } diff --git a/testgen/ui/components/frontend/js/components/score_issues.js b/testgen/ui/components/frontend/js/components/score_issues.js index 46127703..659f8020 100644 --- a/testgen/ui/components/frontend/js/components/score_issues.js +++ b/testgen/ui/components/frontend/js/components/score_issues.js @@ -76,7 +76,7 @@ const IssuesTable = ( const selectedIssues = van.state([]); return div( - { class: 'table', 'data-testid': 'score-issues' }, + { class: 'table pb-0', 'data-testid': 'score-issues' }, div( { class: 'flex-row fx-justify-space-between fx-align-flex-start'}, div( diff --git a/testgen/ui/components/frontend/js/components/select.js b/testgen/ui/components/frontend/js/components/select.js index 72bb11cc..e7350424 100644 --- a/testgen/ui/components/frontend/js/components/select.js +++ b/testgen/ui/components/frontend/js/components/select.js @@ -3,7 +3,7 @@ * @type {object} * @property {string} label * @property {string} value - * @property {boolean} selected + * @property {boolean?} selected * @property {string?} icon * * @typedef Properties diff --git a/testgen/ui/components/frontend/js/components/table_group_stats.js b/testgen/ui/components/frontend/js/components/table_group_stats.js new file mode 100644 index 00000000..361118cd --- /dev/null +++ b/testgen/ui/components/frontend/js/components/table_group_stats.js @@ -0,0 +1,130 @@ +/** + * @typedef TableGroupStats + * @type {object} + * @property {string} id + * @property {string} table_groups_name + * @property {string} table_group_schema + * @property {number} table_ct + * @property {number} column_ct + * @property {number} approx_record_ct + * @property {number?} record_ct + * @property {number} approx_data_point_ct + * @property {number?} data_point_ct + * + * @typedef Properties + * @type {object} + * @property {boolean?} hideApproxCaption + * @property {boolean?} hideWarning + * @property {string?} class + */ +import van from '../van.min.js'; +import { formatNumber } from '../display_utils.js'; +import { Alert } from '../components/alert.js'; + +const { div, span, strong } = 
van.tags; +const profilingWarningText = 'Profiling on large datasets could be time-consuming or resource-intensive, depending on your database configuration.'; + +/** + * @param {Properties} props + * @param {TableGroupStats} stats + * @returns {HTMLElement} + */ +const TableGroupStats = (props, stats) => { + const useApprox = stats.record_ct === null || stats.record_ct === undefined; + const rowCount = useApprox ? stats.approx_record_ct : stats.record_ct; + const dataPointCount = useApprox ? stats.approx_data_point_ct : stats.data_point_ct; + const warning = !props.hideWarning ? WarningText(rowCount, dataPointCount) : null; + + return div( + { class: `flex-column fx-gap-1 p-3 border border-radius-2 ${props.class ?? ''}` }, + span( + span({ class: 'text-secondary' }, 'Schema: '), + stats.table_group_schema, + ), + div( + { class: 'flex-row' }, + div( + { class: 'flex-column fx-gap-1', style: 'flex: 1 1 50%;' }, + span( + span({ class: 'text-secondary' }, 'Tables: '), + formatNumber(stats.table_ct), + ), + span( + span({ class: 'text-secondary' }, 'Columns: '), + formatNumber(stats.column_ct), + ), + ), + div( + { class: 'flex-column fx-gap-1', style: 'flex: 1 1 50%;' }, + span( + span({ class: 'text-secondary' }, 'Rows: '), + formatNumber(rowCount), + useApprox ? ' *' : '', + ), + span( + span({ class: 'text-secondary' }, 'Data points: '), + formatNumber(dataPointCount), + useApprox ? ' *' : '', + ), + ), + ), + useApprox && !props.hideApproxCaption + ? span( + { class: 'text-caption text-right mt-1' }, + '* Approximate counts based on server statistics', + ) + : null, + warning + ? Alert({ type: 'warn', icon: 'warning', class: 'mt-2' }, warning) + : null, + ); +}; + +/** + * @param {number | null} rowCount + * @param {number | null} dataPointCount + * @returns {HTMLElement | null} + */ +const WarningText = (rowCount, dataPointCount) => { + if (rowCount === null) { // Unknown counts + return div(`WARNING: ${profilingWarningText}`); + } + + const rowTier = getStatTier(rowCount); + const dataPointTier = getStatTier(dataPointCount); + + if (rowTier || dataPointTier) { + let category; + if (rowTier && dataPointTier) { + category = rowTier === dataPointTier + ? [ strong(rowTier), ' of rows and data points' ] + : [ strong(rowTier), ' of rows and ', strong(dataPointTier), ' of data points' ]; + } else { + category = rowTier + ? 
[ strong(rowTier), ' of rows' ] + : [ strong(dataPointTier), ' of data points' ]; + } + return div( + div('WARNING: The table group has ', ...category, '.'), + div({ class: 'mt-2' }, profilingWarningText), + ); + } + return null; +} + +/** + * @param {number | null} count + * @returns {string | null} + */ +function getStatTier(/** @type number */ count) { + if (count > 1000000000) { + return 'billions'; + } else if (count > 1000000) { + return 'millions'; + } else if (count > 100000) { + return 'hundreds of thousands'; + } + return null; +}; + +export { TableGroupStats }; diff --git a/testgen/ui/components/frontend/js/components/table_group_test.js b/testgen/ui/components/frontend/js/components/table_group_test.js index bb226a45..ff987f06 100644 --- a/testgen/ui/components/frontend/js/components/table_group_test.js +++ b/testgen/ui/components/frontend/js/components/table_group_test.js @@ -1,9 +1,17 @@ /** + * @import { TableGroupStats } from './table_group_stats.js' + * + * @typedef TablePreview + * @type {object} + * @property {number} column_ct + * @property {number} approx_record_ct + * @property {number} approx_data_point_ct + * @property {boolean} can_access + * * @typedef TableGroupPreview * @type {object} - * @property {string} schema - * @property {Record?} tables - * @property {number?} column_count + * @property {TableGroupStats} stats + * @property {Record?} tables * @property {boolean?} success * @property {string?} message * @@ -12,43 +20,26 @@ * @property {(() => void)?} onVerifyAcess */ import van from '../van.min.js'; -import { emitEvent, getValue } from '../utils.js'; +import { getValue } from '../utils.js'; +import { formatNumber } from '../display_utils.js'; import { Alert } from '../components/alert.js'; import { Icon } from '../components/icon.js'; import { Button } from '../components/button.js'; +import { TableGroupStats } from './table_group_stats.js'; -const { div, span, strong } = van.tags; +const { div, span } = van.tags; /** - * - * @param {string} schema * @param {TableGroupPreview?} preview * @param {ComponentOptions} options * @returns {HTMLElement} */ -const TableGroupTest = (schema, preview, options) => { +const TableGroupTest = (preview, options) => { return div( { class: 'flex-column fx-gap-2' }, div( - { class: 'flex-row fx-justify-space-between' }, - div( - { class: 'flex-column fx-gap-2' }, - div( - { class: 'flex-row fx-gap-1' }, - strong({}, 'Schema:'), - span({}, schema), - ), - div( - { class: 'flex-row fx-gap-1' }, - strong({}, 'Table Count:'), - () => span({}, Object.keys(getValue(preview)?.tables ?? {})?.length ?? '--'), - ), - div( - { class: 'flex-row fx-gap-1' }, - strong({}, 'Column Count:'), - () => span({}, getValue(preview)?.column_count ?? '--'), - ), - ), + { class: 'flex-row fx-justify-space-between fx-align-flex-end' }, + span({ class: 'text-caption text-right' }, '* Approximate row counts based on server statistics'), options.onVerifyAcess ? div( { class: 'flex-row' }, @@ -62,6 +53,9 @@ const TableGroupTest = (schema, preview, options) => { ) : '', ), + () => getValue(preview) + ? TableGroupStats({ hideWarning: true, hideApproxCaption: true }, getValue(preview).stats) + : '', () => { const tableGroupPreview = getValue(preview); const wasPreviewExecuted = tableGroupPreview && typeof tableGroupPreview.success === 'boolean'; @@ -72,33 +66,44 @@ const TableGroupTest = (schema, preview, options) => { const tables = tableGroupPreview?.tables ?? 
{}; const hasTables = Object.keys(tables).length > 0; - const verifiedAccess = Object.values(tables).some(v => v != null); - const tableAccessWarning = Object.values(tables).some(v => v != null && v === false) + const verifiedAccess = Object.values(tables).some(({ can_access }) => can_access != null); + const tableAccessWarning = Object.values(tables).some(({ can_access }) => can_access != null && can_access === false) ? tableGroupPreview.message : ''; + const columns = ['50%', '14%', '14%', '14%', '8%']; + return div( {class: 'flex-column fx-gap-2'}, div( - { class: 'table hoverable p-3' }, + { class: 'table hoverable p-3 pb-0' }, div( - { class: 'table-header flex-row fx-justify-space-between' }, - span('Tables'), + { class: 'table-header flex-row' }, + span({ style: `flex: 1 1 ${columns[0]}; max-width: ${columns[0]};` }, 'Tables'), + span({ style: `flex: 1 1 ${columns[1]};` }, 'Columns'), + span({ style: `flex: 1 1 ${columns[2]};` }, 'Rows *'), + span({ style: `flex: 1 1 ${columns[3]};` }, 'Data Points *'), verifiedAccess - ? span({class: 'flex-row fx-justify-center', style: 'width: 100px;'}, 'Has access?') + ? span({class: 'flex-row fx-justify-center', style: `flex: 1 1 ${columns[4]};`}, 'Can access?') : '', ), div( - { class: 'flex-column', style: 'max-height: 200px; overflow-y: auto;' }, + { class: 'flex-column', style: 'max-height: 400px; overflow-y: auto;' }, hasTables - ? Object.entries(tables).map(([tableName, hasAccess]) => + ? Object.entries(tables).map(([ tableName, table ]) => div( { class: 'table-row flex-row fx-justify-space-between' }, - span(tableName), - hasAccess != null + span( + { style: `flex: 1 1 ${columns[0]}; max-width: ${columns[0]}; word-wrap: break-word;` }, + tableName, + ), + span({ style: `flex: 1 1 ${columns[1]};` }, formatNumber(table.column_ct)), + span({ style: `flex: 1 1 ${columns[2]};` }, formatNumber(table.approx_record_ct)), + span({ style: `flex: 1 1 ${columns[3]};` }, formatNumber(table.approx_data_point_ct)), + table.can_access != null ? span( - {class: 'flex-row fx-justify-center', style: 'width: 100px;'}, - hasAccess + {class: 'flex-row fx-justify-center', style: `flex: 1 1 ${columns[4]};`}, + table.can_access ? 
Icon({classes: 'text-green', size: 20}, 'check_circle') : Icon({classes: 'text-error', size: 20}, 'dangerous'), ) diff --git a/testgen/ui/components/frontend/js/data_profiling/column_distribution.js b/testgen/ui/components/frontend/js/data_profiling/column_distribution.js index 4d51f65f..85689099 100644 --- a/testgen/ui/components/frontend/js/data_profiling/column_distribution.js +++ b/testgen/ui/components/frontend/js/data_profiling/column_distribution.js @@ -11,6 +11,7 @@ import van from '../van.min.js'; import { Card } from '../components/card.js'; import { Attribute } from '../components/attribute.js'; import { Button } from '../components/button.js'; +import { Alert } from '../components/alert.js'; import { SummaryBar } from '../components/summary_bar.js'; import { PercentBar } from '../components/percent_bar.js'; import { FrequencyBars } from '../components/frequency_bars.js'; @@ -24,6 +25,7 @@ const columnTypeFunctionMap = { B: BooleanColumn, D: DatetimeColumn, N: NumericColumn, + X: UnknownColumn, }; const attributeWidth = 250; const percentWidth = 250; @@ -33,16 +35,13 @@ const boxPlotWidth = 800; const ColumnDistributionCard = (/** @type Properties */ props, /** @type Column */ item) => { loadStylesheet('column-distribution', stylesheet); - const columnFunction = columnTypeFunctionMap[item.general_type]; + const displayType = item.profile_run_id && item.record_ct !== 0 ? item.general_type : 'X' + const columnFunction = columnTypeFunctionMap[displayType]; return Card({ border: props.border, title: `Value Distribution ${item.is_latest_profile ? '*' : ''}`, - content: item.profile_run_id - ? (item.record_ct === 0 - ? BaseCounts(item) - : columnFunction?.(item)) - : null, + content: columnFunction?.(item), actionContent: div( { class: 'flex-row fx-gap-3' }, item.profile_run_id @@ -68,13 +67,13 @@ const ColumnDistributionCard = (/** @type Properties */ props, /** @type Column ]) : span( { class: 'text-secondary' }, - 'No profiling data available', + 'No profiling results for column', ), ), }) }; -function AlphaColumn(/** @type ColumnProfile */ item) { +function AlphaColumn(/** @type Column */ item) { const standardPatternLabels = { STREET_ADDR: 'Street Address', STATE_USA: 'State (USA)', @@ -210,7 +209,7 @@ function AlphaColumn(/** @type ColumnProfile */ item) { ); } -function BooleanColumn(/** @type ColumnProfile */ item) { +function BooleanColumn(/** @type Column */ item) { return div( { class: 'flex-column fx-gap-5' }, BaseCounts(item), @@ -227,7 +226,7 @@ function BooleanColumn(/** @type ColumnProfile */ item) { ); } -function DatetimeColumn(/** @type ColumnProfile */ item) { +function DatetimeColumn(/** @type Column */ item) { const total = item.record_ct; return div( @@ -265,7 +264,7 @@ function DatetimeColumn(/** @type ColumnProfile */ item) { ); } -function NumericColumn(/** @type ColumnProfile */ item) { +function NumericColumn(/** @type Column */ item) { return div( { class: 'flex-column fx-gap-5' }, BaseCounts(item), @@ -309,18 +308,43 @@ function NumericColumn(/** @type ColumnProfile */ item) { ); } -const BaseCounts = (/** @type ColumnProfile */ item) => { +function UnknownColumn(/** @type Column */ item) { + return div( + { class: 'flex-column fx-gap-3' }, + BaseCounts(item), + item.profiling_error + ? 
Alert( + { type: 'warn', icon: 'warning' }, + div({ style: 'font-size: 14px;' }, 'Profiling encountered an error for this column.'), + div({ class: 'text-primary text-code', style: 'font-size: 12px;' }, item.profiling_error), + ) + : null, + ); +} + +const BaseCounts = (/** @type Column */ item) => { + const useApprox = item.record_ct === null; const attributes = [ - { key: 'record_ct', label: 'Record Count' }, - { key: 'value_ct', label: 'Value Count' }, + { + label: `Row Count${useApprox ? ' †' : ''}`, + value: useApprox ? item.approx_record_ct : item.record_ct, + } ]; + if (item.value_ct !== null) { + attributes.push({ label: 'Value Count', value: item.value_ct }); + } return div( - { class: 'flex-row fx-gap-4' }, - attributes.map(({ key, label }) => Attribute({ - label: item[key] === 0 ? span({ class: 'text-error' }, label) : label, - value: formatNumber(item[key]), - width: attributeWidth, - })), + div( + { class: 'flex-row fx-gap-4' }, + attributes.map(({ label, value }) => Attribute({ + label: value === 0 ? span({ class: 'text-error' }, label) : label, + value: formatNumber(value), + width: attributeWidth, + })), + ), + useApprox + ? div({ class: 'text-caption text-right mt-1' }, '† Approximate count based on server statistics') + : null, ); }; diff --git a/testgen/ui/components/frontend/js/data_profiling/data_issues.js b/testgen/ui/components/frontend/js/data_profiling/data_issues.js index 40265317..261a2283 100644 --- a/testgen/ui/components/frontend/js/data_profiling/data_issues.js +++ b/testgen/ui/components/frontend/js/data_profiling/data_issues.js @@ -41,6 +41,7 @@ const STATUS_COLORS = { }; const PotentialPIICard = (/** @type Properties */ props, /** @type Table | Column */ item) => { + const title = `Potential PII ${item.is_latest_profile ? '*' : ''}`; const attributes = [ { key: 'detail', width: 150, label: 'Type', @@ -66,12 +67,15 @@ const PotentialPIICard = (/** @type Properties */ props, /** @type Table | Colum href: 'profiling-runs:hygiene', params: { run_id: item.profile_run_id, issue_class: 'Potential PII' }, }; - const noneContent = item.profile_run_id ? 'No potential PII detected' : null; + const noneContent = item.profile_run_id && !item.profiling_error + ? 'No potential PII detected' + : span({ class: 'text-secondary' }, `No profiling results for ${item.type}`); - return IssuesCard(props, 'Potential PII *', potentialPII, attributes, linkProps, noneContent); + return IssuesCard(props, title, potentialPII, attributes, linkProps, noneContent); }; const HygieneIssuesCard = (/** @type Properties */ props, /** @type Table | Column */ item) => { + const title = `Hygiene Issues ${item.is_latest_profile ? '*' : ''}`; const attributes = [ { key: 'anomaly_name', width: 200, label: 'Issue' }, { @@ -99,9 +103,11 @@ const HygieneIssuesCard = (/** @type Properties */ props, /** @type Table | Colu column_name: item.column_name, }, }; - const noneContent = item.profile_run_id ? 'No hygiene issues detected' : null; + const noneContent = item.profile_run_id && !item.profiling_error + ? 
'No hygiene issues detected' + : span({ class: 'text-secondary' }, `No profiling results for ${item.type}`); - return IssuesCard(props, 'Hygiene Issues *', hygieneIssues, attributes, linkProps, noneContent); + return IssuesCard(props, title, hygieneIssues, attributes, linkProps, noneContent); }; const TestIssuesCard = (/** @type Properties */ props, /** @type Table | Column */ item) => { diff --git a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js index e1896702..6c4c9586 100644 --- a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js +++ b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js @@ -75,12 +75,14 @@ * @property {string?} profile_run_id * @property {number?} profile_run_date * @property {boolean?} is_latest_profile + * @property {string?} profiling_error * @property {number?} has_test_runs * * Scores * @property {string?} dq_score * @property {string?} dq_score_profiling * @property {string?} dq_score_testing * * Value Counts + * @property {number?} approx_record_ct * @property {number} record_ct * @property {number} value_ct * @property {number} distinct_value_ct @@ -147,9 +149,9 @@ * @property {string} project_code * * Characteristics * @property {string} functional_table_type + * @property {number} approx_record_ct * @property {number} record_ct * @property {number} column_ct - * @property {number} data_point_ct * @property {number} add_date * @property {number} last_refresh_date * @property {number} drop_date @@ -231,10 +233,10 @@ const LatestProfilingTime = (/** @type Properties */ props, /** @type Table | Co }); if (!item.profile_run_id) { if (item.drop_date) { - text = 'No profiling results for table group'; + text = `No profiling results for ${item.type}`; link = null; } else { - text = 'No profiling results yet for table group.'; + text = `No profiling results yet for ${item.type}.`; link = Link({ href: 'table-groups', params: { project_code: item.project_code, connection_id: item.connection_id }, diff --git a/testgen/ui/components/frontend/js/data_profiling/table_size.js b/testgen/ui/components/frontend/js/data_profiling/table_size.js index 2573d9c3..af0f43b3 100644 --- a/testgen/ui/components/frontend/js/data_profiling/table_size.js +++ b/testgen/ui/components/frontend/js/data_profiling/table_size.js @@ -14,10 +14,15 @@ import { formatNumber, formatTimestamp } from '../display_utils.js'; const { div, span } = van.tags; const TableSizeCard = (/** @type Properties */ _props, /** @type Table */ item) => { + const useApprox = item.record_ct === null; + const rowCount = useApprox ? item.approx_record_ct : item.record_ct; const attributes = [ - { key: 'column_ct', label: 'Column Count' }, - { key: 'record_ct', label: 'Row Count' }, - { key: 'data_point_ct', label: 'Data Point Count' }, + { label: 'Column Count', value: item.column_ct }, + { label: `Row Count${useApprox ? ' †': ''}`, value: rowCount }, + { + label: `Data Point Count${useApprox ? ' †': ''}`, + value: rowCount !== null ? (item.column_ct * rowCount) : null, + } ]; return Card({ @@ -25,13 +30,16 @@ const TableSizeCard = (/** @type Properties */ _props, /** @type Table */ item) content: div( div( { class: 'flex-row fx-flex-wrap fx-gap-4' }, - attributes.map(({ key, label }) => Attribute({ - label: item[key] === 0 ? span({ class: 'text-error' }, label) : label, - value: formatNumber(item[key]), + attributes.map(({ label, value }) => Attribute({ + label: value === 0 ? 
span({ class: 'text-error' }, label) : label, + value: formatNumber(value), width: 250, })), ), - span({ class: 'text-caption flex-row fx-justify-content-flex-end mt-2' }, `** as of ${formatTimestamp(item.last_refresh_date)}`), + div({ class: 'text-caption text-right mt-1' }, `** as of ${formatTimestamp(item.last_refresh_date)}`), + useApprox + ? div({ class: 'text-caption text-right mt-1' }, '† Approximate counts based on server statistics') + : null, ), actionContent: Button({ type: 'stroked', diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js index 49f073cd..4fc62ba5 100644 --- a/testgen/ui/components/frontend/js/pages/profiling_runs.js +++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js @@ -2,18 +2,29 @@ * @import { ProjectSummary } from '../types.js'; * @import { SelectOption } from '../components/select.js'; * + * + * @typedef ProgressStep + * @type {object} + * @property {'data_chars'|'col_profiling'|'freq_analysis'|'hygiene_issues'} key + * @property {'Pending'|'Running'|'Completed'|'Warning'} status + * @property {string} label + * @property {string} detail + * * @typedef ProfilingRun * @type {object} - * @property {string} profiling_run_id - * @property {number} start_time - * @property {number} end_time + * @property {string} id + * @property {number} profiling_starttime + * @property {number} profiling_endtime * @property {string} table_groups_name * @property {'Running'|'Complete'|'Error'|'Cancelled'} status + * @property {ProgressStep[]} progress * @property {string} log_message * @property {string} process_id - * @property {string} schema_name + * @property {string} table_group_schema * @property {number} column_ct * @property {number} table_ct + * @property {number} record_ct + * @property {number} data_point_ct * @property {number} anomaly_ct * @property {number} anomalies_definite_ct * @property {number} anomalies_likely_ct @@ -33,28 +44,37 @@ * @property {Permissions} permissions */ import van from '../van.min.js'; -import { Tooltip } from '../components/tooltip.js'; +import { withTooltip } from '../components/tooltip.js'; import { SummaryCounts } from '../components/summary_counts.js'; import { Link } from '../components/link.js'; import { Button } from '../components/button.js'; import { Streamlit } from '../streamlit.js'; import { emitEvent, getValue, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange } from '../utils.js'; -import { formatTimestamp, formatDuration } from '../display_utils.js'; +import { formatTimestamp, formatDuration, formatNumber } from '../display_utils.js'; import { Checkbox } from '../components/checkbox.js'; import { Select } from '../components/select.js'; import { Paginator } from '../components/paginator.js'; import { EMPTY_STATE_MESSAGE, EmptyState } from '../components/empty_state.js'; +import { Icon } from '../components/icon.js'; const { div, i, span, strong } = van.tags; const PAGE_SIZE = 100; const SCROLL_CONTAINER = window.top.document.querySelector('.stMain'); +const REFRESH_INTERVAL = 15000 // 15 seconds + +const progressStatusIcons = { + Pending: { color: 'grey', icon: 'more_horiz', size: 22 }, + Running: { color: 'blue', icon: 'autoplay', size: 18 }, + Completed: { color: 'green', icon: 'check', size: 24 }, + Warning: { color: 'orange', icon: 'warning', size: 20 }, +}; const ProfilingRuns = (/** @type Properties */ props) => { loadStylesheet('profilingRuns', stylesheet); Streamlit.setFrameHeight(1); 
window.testgen.isPage = true; - const columns = ['5%', '15%', '15%', '20%', '35%', '10%']; + const columns = ['5%', '15%', '20%', '20%', '30%', '10%']; const userCanEdit = getValue(props.permissions)?.can_edit ?? false; const pageIndex = van.state(0); @@ -62,13 +82,24 @@ const ProfilingRuns = (/** @type Properties */ props) => { pageIndex.val = 0; return getValue(props.profiling_runs); }); - const paginatedRuns = van.derive(() => profilingRuns.val.slice(PAGE_SIZE * pageIndex.val, PAGE_SIZE * (pageIndex.val + 1))); + let refreshIntervalId = null; + + const paginatedRuns = van.derive(() => { + const paginated = profilingRuns.val.slice(PAGE_SIZE * pageIndex.val, PAGE_SIZE * (pageIndex.val + 1)); + const hasActiveRuns = paginated.some(({ status }) => status === 'Running'); + if (!refreshIntervalId && hasActiveRuns) { + refreshIntervalId = setInterval(() => emitEvent('RefreshData', {}), REFRESH_INTERVAL); + } else if (refreshIntervalId && !hasActiveRuns) { + clearInterval(refreshIntervalId); + } + return paginated; + }); const selectedRuns = {}; const initializeSelectedStates = (items) => { for (const profilingRun of items) { - if (selectedRuns[profilingRun.profiling_run_id] == undefined) { - selectedRuns[profilingRun.profiling_run_id] = van.state(false); + if (selectedRuns[profilingRun.id] == undefined) { + selectedRuns[profilingRun.id] = van.state(false); } } }; @@ -80,19 +111,18 @@ const ProfilingRuns = (/** @type Properties */ props) => { resizeFrameHeightOnDOMChange(wrapperId); return div( - { id: wrapperId }, + { id: wrapperId, class: 'tg-profiling-runs' }, () => { const projectSummary = getValue(props.project_summary); return projectSummary.profiling_run_count > 0 ? div( - { class: 'tg-profiling-runs' }, Toolbar(props, userCanEdit), () => profilingRuns.val.length ? div( div( - { class: 'table' }, + { class: 'table pb-0' }, () => { - const selectedItems = profilingRuns.val.filter(i => selectedRuns[i.profiling_run_id]?.val ?? false); + const selectedItems = profilingRuns.val.filter(i => selectedRuns[i.id]?.val ?? false); const someRunSelected = selectedItems.length > 0; const tooltipText = !someRunSelected ? 'No runs selected' : undefined; @@ -112,7 +142,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { tooltipPosition: 'bottom-left', disabled: !someRunSelected, width: 'auto', - onclick: () => emitEvent('RunsDeleted', { payload: selectedItems.map(i => i.profiling_run_id) }), + onclick: () => emitEvent('RunsDeleted', { payload: selectedItems.map(i => i.id) }), }), ); }, @@ -120,7 +150,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { { class: 'table-header flex-row' }, () => { const items = profilingRuns.val; - const selectedItems = items.filter(i => selectedRuns[i.profiling_run_id]?.val ?? false); + const selectedItems = items.filter(i => selectedRuns[i.id]?.val ?? false); const allSelected = selectedItems.length === items.length; const partiallySelected = selectedItems.length > 0 && selectedItems.length < items.length; @@ -134,7 +164,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { ? 
Checkbox({ checked: allSelected, indeterminate: partiallySelected, - onChange: (checked) => items.forEach(item => selectedRuns[item.profiling_run_id].val = checked), + onChange: (checked) => items.forEach(item => selectedRuns[item.id].val = checked), testId: 'select-all-profiling-run', }) : '', @@ -153,7 +183,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { 'Schema', ), span( - { style: `flex: ${columns[4]}` }, + { style: `flex: ${columns[4]}`, class: 'tg-profiling-runs--issues' }, 'Hygiene Issues', ), span( @@ -162,7 +192,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { ), ), div( - paginatedRuns.val.map(item => ProfilingRunItem(item, columns, selectedRuns[item.profiling_run_id], userCanEdit)), + paginatedRuns.val.map(item => ProfilingRunItem(item, columns, selectedRuns[item.id], userCanEdit)), ), ), Paginator({ @@ -192,7 +222,7 @@ const Toolbar = ( /** @type boolean */ userCanEdit, ) => { return div( - { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4 fx-gap-4' }, + { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4 fx-gap-4 fx-flex-wrap' }, () => Select({ label: 'Table Group', value: getValue(props.table_group_options)?.find((op) => op.selected)?.value ?? null, @@ -243,6 +273,8 @@ const ProfilingRunItem = ( /** @type boolean */ selected, /** @type boolean */ userCanEdit, ) => { + const runningStep = item.progress?.find((item) => item.status === 'Running'); + return div( { class: 'table-row flex-row', 'data-testid': 'profiling-run-item' }, userCanEdit @@ -257,49 +289,79 @@ const ProfilingRunItem = ( : '', div( { style: `flex: ${columns[1]}` }, - div({ 'data-testid': 'profiling-run-item-starttime' }, formatTimestamp(item.start_time)), + div({ 'data-testid': 'profiling-run-item-starttime' }, formatTimestamp(item.profiling_starttime)), div( { class: 'text-caption mt-1', 'data-testid': 'profiling-run-item-tablegroup' }, item.table_groups_name, ), ), div( - { class: 'flex-row', style: `flex: ${columns[2]}` }, + { style: `flex: ${columns[2]}` }, div( + { class: 'flex-row' }, ProfilingRunStatus(item), - div( + item.status === 'Running' && item.process_id && userCanEdit ? Button({ + type: 'stroked', + label: 'Cancel', + style: 'width: 64px; height: 28px; color: var(--purple); margin-left: 12px;', + onclick: () => emitEvent('RunCanceled', { payload: item }), + }) : null, + ), + item.profiling_endtime + ? div( { class: 'text-caption mt-1', 'data-testid': 'profiling-run-item-duration' }, - formatDuration(item.start_time, item.end_time), + formatDuration(item.profiling_starttime, item.profiling_endtime), + ) + : div( + { class: 'text-caption mt-1' }, + item.status === 'Running' && runningStep + ? [ + div( + runningStep.label, + withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px; vertical-align: middle;' }, 'info'), + { text: ProgressTooltip(item) }, + ), + ), + div(runningStep.detail), + ] + : '--', ), - ), - item.status === 'Running' && item.process_id && userCanEdit ? Button({ - type: 'stroked', - label: 'Cancel Run', - style: 'width: auto; height: 32px; color: var(--purple); margin-left: 16px;', - onclick: () => emitEvent('RunCanceled', { payload: item }), - }) : null, ), div( { style: `flex: ${columns[3]}` }, - div({ 'data-testid': 'profiling-run-item-schema' }, item.schema_name), + div({ 'data-testid': 'profiling-run-item-schema' }, item.table_group_schema), div( { class: 'text-caption mt-1 mb-1', style: item.status === 'Complete' && !item.column_ct ? 
'color: var(--red);' : '', 'data-testid': 'profiling-run-item-counts', }, - item.status === 'Complete' ? `${item.table_ct || 0} tables, ${item.column_ct || 0} columns` : null, + item.column_ct !== null + ? div( + `${formatNumber(item.table_ct || 0)} tables, ${formatNumber(item.column_ct || 0)} columns`, + item.record_ct !== null ? + withTooltip( + Icon({ style: 'font-size: 16px; margin-left: 4px; vertical-align: middle;' }, 'more' ), + { text: [ + div(`${formatNumber(item.record_ct || 0)} records`), + div(`${formatNumber(item.data_point_ct || 0)} data points`), + ] }, + ) + : null, + ) + : null, ), - item.column_ct ? Link({ + item.status === 'Complete' && item.column_ct ? Link({ label: 'View results', href: 'profiling-runs:results', - params: { 'run_id': item.profiling_run_id }, + params: { 'run_id': item.id }, underline: true, right_icon: 'chevron_right', }) : null, ), div( - { class: 'pr-3', style: `flex: ${columns[4]}` }, + { class: 'pr-3 tg-profiling-runs--issues', style: `flex: ${columns[4]}` }, item.anomaly_ct ? SummaryCounts({ items: [ { label: 'Definite', value: item.anomalies_definite_ct, color: 'red' }, @@ -311,7 +373,7 @@ const ProfilingRunItem = ( item.anomaly_ct ? Link({ label: `View ${item.anomaly_ct} issues`, href: 'profiling-runs:hygiene', - params: { 'run_id': item.profiling_run_id }, + params: { 'run_id': item.id }, underline: true, right_icon: 'chevron_right', style: 'margin-top: 4px;', @@ -327,7 +389,7 @@ const ProfilingRunItem = ( ); } -function ProfilingRunStatus(/** @type ProfilingRun */ item) { +const ProfilingRunStatus = (/** @type ProfilingRun */ item) => { const attributeMap = { Running: { label: 'Running', color: 'blue' }, Complete: { label: 'Completed', color: '' }, @@ -335,6 +397,7 @@ function ProfilingRunStatus(/** @type ProfilingRun */ item) { Cancelled: { label: 'Canceled', color: 'purple' }, }; const attributes = attributeMap[item.status] || { label: 'Unknown', color: 'grey' }; + const hasProgressError = item.progress?.some(({error}) => !!error); return span( { class: 'flex-row', @@ -342,21 +405,41 @@ function ProfilingRunStatus(/** @type ProfilingRun */ item) { 'data-testid': 'profiling-run-item-status' }, attributes.label, - () => { - const tooltipError = van.state(false); - return item.status === 'Error' && item.log_message ? i( - { - class: 'material-symbols-rounded text-secondary ml-1 profiling-runs--info', - style: 'position: relative; font-size: 16px;', - onmouseenter: () => tooltipError.val = true, - onmouseleave: () => tooltipError.val = false, - }, - 'info', - Tooltip({ text: item.log_message, show: tooltipError }), - ) : null; - }, + item.status === 'Complete' && hasProgressError + ? withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px; vertical-align: middle; color: var(--orange);' }, 'warning' ), + { text: ProgressTooltip(item) }, + ) + : null, + item.status === 'Error' && item.log_message + ? 
withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px;' }, 'info'), + { text: item.log_message, width: 250, style: 'word-break: break-word;' }, + ) + : null, ); -} +}; + +const ProgressTooltip = (/** @type ProfilingRun */ item) => { + return div( + { class: 'flex-column fx-gap-1' }, + item.progress?.map(step => { + const stepIcon = progressStatusIcons[step.status]; + return div( + { class: 'flex-row fx-gap-1' }, + Icon( + { style: `font-size: ${stepIcon.size}px; color: var(--${stepIcon.color}); min-width: 24px;` }, + stepIcon.icon, + ), + div( + { class: 'flex-column fx-align-flex-start text-left' }, + span(`${step.label}${step.detail ? (': ' + step.detail) : ''}`), + span({ style: 'font-size: 12px; opacity: 0.6; margin-top: 2px;' }, step.error), + ), + ); + }), + ); +}; const ConditionalEmptyState = ( /** @type ProjectSummary */ projectSummary, @@ -408,7 +491,11 @@ const ConditionalEmptyState = ( const stylesheet = new CSSStyleSheet(); stylesheet.replace(` .tg-profiling-runs { - min-height: 500px; + min-height: 550px; +} + +.tg-profiling-runs--issues { + min-width: 310px; } `); diff --git a/testgen/ui/components/frontend/js/pages/project_dashboard.js b/testgen/ui/components/frontend/js/pages/project_dashboard.js index d3f5b2ec..1eeee31a 100644 --- a/testgen/ui/components/frontend/js/pages/project_dashboard.js +++ b/testgen/ui/components/frontend/js/pages/project_dashboard.js @@ -6,14 +6,17 @@ * @type {object} * @property {string} id * @property {string} table_groups_name + * @property {number} table_ct + * @property {number} column_ct + * @property {number} approx_record_ct + * @property {number} record_ct + * @property {number} approx_data_point_ct + * @property {number} data_point_ct * @property {string?} dq_score * @property {string?} dq_score_profiling * @property {string?} dq_score_testing * @property {string?} latest_profile_id * @property {number?} latest_profile_start - * @property {number} latest_profile_table_ct - * @property {number} latest_profile_column_ct - * @property {number} latest_profile_data_point_ct * @property {number} latest_anomalies_ct * @property {number} latest_anomalies_definite_ct * @property {number} latest_anomalies_likely_ct @@ -124,6 +127,7 @@ const ProjectDashboard = (/** @type Properties */ props) => { } const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { + const useApprox = tableGroup.record_ct === null || tableGroup.record_ct === undefined; return Card({ testId: 'table-group-summary-card', border: true, @@ -139,9 +143,12 @@ const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { ), span( { class: 'text-caption mt-1 mb-3 tg-overview--subtitle' }, - `${formatNumber(tableGroup.latest_profile_table_ct ?? 0)} tables | - ${formatNumber(tableGroup.latest_profile_column_ct ?? 0)} columns | - ${formatNumber(tableGroup.latest_profile_data_point_ct ?? 0)} data points`, + `${formatNumber(tableGroup.table_ct ?? 0)} tables | + ${formatNumber(tableGroup.column_ct ?? 0)} columns | + ${formatNumber(useApprox ? tableGroup.approx_record_ct : tableGroup.record_ct)} rows + ${useApprox ? '*' : ''} | + ${formatNumber(useApprox ? tableGroup.approx_data_point_ct : tableGroup.data_point_ct)} data points + ${useApprox ? '*' : ''}`, ), TableGroupTestSuiteSummary(tableGroup.test_suites), ), @@ -149,6 +156,9 @@ const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { ), hr({ class: 'tg-overview--table-group-divider' }), TableGroupLatestProfile(tableGroup), + useApprox + ? 
span({ class: 'text-caption text-right' }, '* Approximate counts based on server statistics') + : null, ) }); }; diff --git a/testgen/ui/components/frontend/js/pages/quality_dashboard.js b/testgen/ui/components/frontend/js/pages/quality_dashboard.js index 06070158..371c9ce8 100644 --- a/testgen/ui/components/frontend/js/pages/quality_dashboard.js +++ b/testgen/ui/components/frontend/js/pages/quality_dashboard.js @@ -38,12 +38,23 @@ const QualityDashboard = (/** @type {Properties} */ props) => { const sortedBy = van.state('name'); const filterTerm = van.state(''); + + const scoreToNumber = (score) => score ? (score.startsWith('>') ? 99.99 : Number(score)) : 101; + const sortFunctions = { + name: (a, b) => caseInsensitiveSort(a.name, b.name), + score: (a, b) => { + const scoreA = Math.min(scoreToNumber(a.score), scoreToNumber(a.cde_score)); + const scoreB = Math.min(scoreToNumber(b.score), scoreToNumber(b.cde_score)); + return scoreA - scoreB; + }, + }; + const scores = van.derive(() => { const sort = getValue(sortedBy) ?? 'name'; const filter = getValue(filterTerm) ?? ''; return getValue(props.scores) .filter(score => caseInsensitiveIncludes(score.name, filter)) - .sort((a, b) => caseInsensitiveSort(a[sort], b[sort])); + .sort(sortFunctions[sort]); }); return div( diff --git a/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js b/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js index f5fd0f1e..59c17a17 100644 --- a/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js +++ b/testgen/ui/components/frontend/js/pages/run_profiling_dialog.js @@ -1,14 +1,17 @@ /** - * @import { TableGroup } from '../components/table_group_form.js'; + * @import { TableGroupStats } from '../components/table_group_stats.js' * * @typedef Result * @type {object} * @property {boolean} success * @property {string?} message + * @property {boolean?} show_link * * @typedef Properties * @type {object} - * @property {TableGroup} table_group + * @property {TableGroupStats[]} table_groups + * @property {string} selected_id + * @property {boolean} allow_selection * @property {Result?} result */ import van from '../van.min.js'; @@ -16,80 +19,108 @@ import { Streamlit } from '../streamlit.js'; import { Alert } from '../components/alert.js'; import { ExpanderToggle } from '../components/expander_toggle.js'; import { Icon } from '../components/icon.js'; -import { emitEvent, getValue, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; +import { emitEvent, getValue, loadStylesheet, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; import { Code } from '../components/code.js'; import { Button } from '../components/button.js'; +import { Select } from '../components/select.js'; +import { TableGroupStats } from '../components/table_group_stats.js'; -const { div, em, span, strong } = van.tags; +const { div, span, strong } = van.tags; /** * @param {Properties} props */ const RunProfilingDialog = (props) => { + loadStylesheet('run-profiling', stylesheet); Streamlit.setFrameHeight(1); window.testgen.isPage = true; - const wrapperId = 'runprogiling-wrapper'; + const wrapperId = 'run-profiling-wrapper'; resizeFrameHeightToElement(wrapperId); resizeFrameHeightOnDOMChange(wrapperId); - const tableGroup = getValue(props.table_group); + const tableGroups = getValue(props.table_groups); + const allowSelection = getValue(props.allow_selection); + const selectedId = van.state(getValue(props.selected_id)); + const selectedTableGroup = van.derive(() => 
tableGroups.find(({ id }) => id === selectedId.val)); const showCLICommand = van.state(false); return div( - { id: wrapperId, class: 'flex-column fx-gap-3' }, + { id: wrapperId }, div( - { class: 'flex-row fx-gap-1' }, - span({}, 'Execute profiling for the table group'), - strong({}, tableGroup.table_groups_name), - span({}, '?'), - ), - div( - { class: 'flex-row fx-gap-1' }, - Icon({}, 'info'), - em({}, ' Profiling will be performed in a background process.'), - ), - ExpanderToggle({ - collapseLabel: 'Collapse', - expandLabel: 'Show CLI command', - onCollapse: () => showCLICommand.val = false, - onExpand: () => showCLICommand.val = true, - }), - Code({ class: () => showCLICommand.val ? '' : 'hidden' }, `testgen run-profile --table-group-id ${tableGroup.id}`), - () => { - const result = getValue(props.result) ?? {}; - return result.message - ? Alert({ type: result.success ? 'success' : 'error' }, span(result.message)) - : ''; - }, - div( - { class: 'flex-row fx-justify-content-flex-end' }, + { class: `flex-column fx-gap-3 ${allowSelection ? 'run-profiling--allow-selection' : ''}` }, + allowSelection + ? Select({ + label: 'Table Group', + value: selectedId, + options: tableGroups.map(({ id, table_groups_name }) => ({ label: table_groups_name, value: id })), + portalClass: 'run-profiling--select', + }) + : span( + 'Run profiling for the table group ', + strong({}, selectedTableGroup.val.table_groups_name), + '?', + ), + () => selectedTableGroup.val + ? div( + TableGroupStats({ class: 'mt-1 mb-3' }, selectedTableGroup.val), + ExpanderToggle({ + collapseLabel: 'Collapse', + expandLabel: 'Show CLI command', + onCollapse: () => showCLICommand.val = false, + onExpand: () => showCLICommand.val = true, + }), + Code({ class: () => showCLICommand.val ? '' : 'hidden' }, `testgen run-profile --table-group-id ${selectedTableGroup.val.id}`), + ) + : div({ style: 'margin: auto;' }, 'Select a table group to profile.'), () => { - const result = getValue(props.result); - - if (result && result.success) { - return Button({ - type: 'stroked', - color: 'primary', - label: 'Go to Profiling Runs', - width: 'auto', - icon: 'chevron_right', - onclick: () => emitEvent('GoToProfilingRunsClicked', { payload: tableGroup.id }), - }); - } - - return Button({ + const result = getValue(props.result) ?? {}; + return result.message + ? Alert({ type: result.success ? 'success' : 'error' }, span(result.message)) + : ''; + }, + ), + () => !getValue(props.result) + ? div( + { class: 'flex-row fx-justify-space-between mt-3' }, + div( + { class: 'flex-row fx-gap-1' }, + Icon({ size: 16 }, 'info'), + span({ class: 'text-caption' }, ' Profiling will be performed in a background process.'), + ), + Button({ label: 'Run Profiling', type: 'stroked', color: 'primary', width: 'auto', style: 'width: auto;', - onclick: () => emitEvent('RunProfilingConfirmed', { payload: tableGroup.id }), - }); - } - ) + disabled: !selectedTableGroup.val, + onclick: () => emitEvent('RunProfilingConfirmed', { payload: selectedTableGroup.val }), + }), + ) : '', + () => getValue(props.result)?.show_link + ? 
Button({ + type: 'stroked', + color: 'primary', + label: 'Go to Profiling Runs', + style: 'width: auto; margin-left: auto; margin-top: 12px;', + icon: 'chevron_right', + onclick: () => emitEvent('GoToProfilingRunsClicked', { payload: selectedTableGroup.val.id }), + }) + : '', ); }; +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.run-profiling--allow-selection { + min-height: 225px; +} + +.run-profiling--select { + max-height: 200px !important; +} +`); + export { RunProfilingDialog }; \ No newline at end of file diff --git a/testgen/ui/components/frontend/js/pages/table_group_wizard.js b/testgen/ui/components/frontend/js/pages/table_group_wizard.js index 916506ad..074bff2b 100644 --- a/testgen/ui/components/frontend/js/pages/table_group_wizard.js +++ b/testgen/ui/components/frontend/js/pages/table_group_wizard.js @@ -23,6 +23,7 @@ import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; import { TableGroupForm } from '../components/table_group_form.js'; import { TableGroupTest } from '../components/table_group_test.js'; +import { TableGroupStats } from '../components/table_group_stats.js'; import { emitEvent, getValue, resizeFrameHeightOnDOMChange, resizeFrameHeightToElement } from '../utils.js'; import { Button } from '../components/button.js'; import { Alert } from '../components/alert.js'; @@ -147,7 +148,6 @@ const TableGroupWizard = (props) => { } return TableGroupTest( - tableGroup.table_group_schema ?? '--', props.table_group_preview, { onVerifyAcess: () => { @@ -175,6 +175,7 @@ const TableGroupWizard = (props) => { return RunProfilingStep( stepsState.tableGroup.rawVal, runProfiling, + props.table_group_preview, results?.success ?? false, ); }); @@ -246,10 +247,11 @@ const TableGroupWizard = (props) => { /** * @param {object} tableGroup * @param {boolean} runProfiling + * @param {TableGroupPreview?} preview * @param {boolean?} disabled * @returns */ -const RunProfilingStep = (tableGroup, runProfiling, disabled) => { +const RunProfilingStep = (tableGroup, runProfiling, preview, disabled) => { return div( { class: 'flex-column fx-gap-3' }, Checkbox({ @@ -263,12 +265,18 @@ const RunProfilingStep = (tableGroup, runProfiling, disabled) => { disabled: disabled ?? false, onChange: (value) => runProfiling.val = value, }), + () => runProfiling.val && preview.val + ? TableGroupStats({ class: 'mt-1 mb-1' }, preview.val.stats) + : '', div( { class: 'flex-row fx-gap-1' }, - Icon({}, 'info'), - () => runProfiling.val - ? i('Profiling will be performed in a background process.') - : i('Profiling will be skipped. You can run this step later from the Profiling Runs page.'), + Icon({ size: 16 }, 'info'), + span( + { class: 'text-caption' }, + () => runProfiling.val + ? 'Profiling will be performed in a background process.' + : 'Profiling will be skipped. 
You can run this step later from the Profiling Runs page.', + ), ), ); }; diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js index 8b148f99..b9796462 100644 --- a/testgen/ui/components/frontend/js/pages/test_runs.js +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -2,6 +2,13 @@ * @import { ProjectSummary } from '../types.js'; * @import { SelectOption } from '../components/select.js'; * + * @typedef ProgressStep + * @type {object} + * @property {'data_chars'|'validation'|'QUERY'|'CAT'|'METADATA'} key + * @property {'Pending'|'Running'|'Completed'|'Warning'} status + * @property {string} label + * @property {string} detail + * * @typedef TestRun * @type {object} * @property {string} test_run_id @@ -10,6 +17,7 @@ * @property {string} table_groups_name * @property {string} test_suite * @property {'Running'|'Complete'|'Error'|'Cancelled'} status + * @property {ProgressStep[]} progress * @property {string} log_message * @property {string} process_id * @property {number} test_ct @@ -34,7 +42,7 @@ * @property {Permissions} permissions */ import van from '../van.min.js'; -import { Tooltip } from '../components/tooltip.js'; +import { withTooltip } from '../components/tooltip.js'; import { SummaryBar } from '../components/summary_bar.js'; import { Link } from '../components/link.js'; import { Button } from '../components/button.js'; @@ -45,10 +53,19 @@ import { Checkbox } from '../components/checkbox.js'; import { Select } from '../components/select.js'; import { Paginator } from '../components/paginator.js'; import { EMPTY_STATE_MESSAGE, EmptyState } from '../components/empty_state.js'; +import { Icon } from '../components/icon.js'; const { div, i, span, strong } = van.tags; const PAGE_SIZE = 100; const SCROLL_CONTAINER = window.top.document.querySelector('.stMain'); +const REFRESH_INTERVAL = 15000 // 15 seconds + +const progressStatusIcons = { + Pending: { color: 'grey', icon: 'more_horiz', size: 22 }, + Running: { color: 'blue', icon: 'autoplay', size: 18 }, + Completed: { color: 'green', icon: 'check', size: 24 }, + Warning: { color: 'orange', icon: 'warning', size: 20 }, +}; const TestRuns = (/** @type Properties */ props) => { loadStylesheet('testRuns', stylesheet); @@ -63,7 +80,18 @@ const TestRuns = (/** @type Properties */ props) => { pageIndex.val = 0; return getValue(props.test_runs); }); - const paginatedRuns = van.derive(() => testRuns.val.slice(PAGE_SIZE * pageIndex.val, PAGE_SIZE * (pageIndex.val + 1))); + let refreshIntervalId = null; + + const paginatedRuns = van.derive(() => { + const paginated = testRuns.val.slice(PAGE_SIZE * pageIndex.val, PAGE_SIZE * (pageIndex.val + 1)); + const hasActiveRuns = paginated.some(({ status }) => status === 'Running'); + if (!refreshIntervalId && hasActiveRuns) { + refreshIntervalId = setInterval(() => emitEvent('RefreshData', {}), REFRESH_INTERVAL); + } else if (refreshIntervalId && !hasActiveRuns) { + clearInterval(refreshIntervalId); + } + return paginated; + }); const selectedRuns = {}; const initializeSelectedStates = (items) => { @@ -81,17 +109,16 @@ const TestRuns = (/** @type Properties */ props) => { resizeFrameHeightOnDOMChange(wrapperId); return div( - { id: wrapperId }, + { id: wrapperId, class: 'tg-test-runs' }, () => { const projectSummary = getValue(props.project_summary); return projectSummary.test_run_count > 0 ? div( - { class: 'tg-test-runs' }, Toolbar(props, userCanEdit), () => testRuns.val.length ? 
div( div( - { class: 'table' }, + { class: 'table pb-0' }, () => { const selectedItems = testRuns.val.filter(i => selectedRuns[i.test_run_id]?.val ?? false); const someRunSelected = selectedItems.length > 0; @@ -189,7 +216,7 @@ const Toolbar = ( /** @type boolean */ userCanEdit, ) => { return div( - { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4 fx-gap-4' }, + { class: 'flex-row fx-align-flex-end fx-justify-space-between mb-4 fx-gap-4 fx-flex-wrap' }, div( { class: 'flex-row fx-gap-4' }, () => Select({ @@ -252,6 +279,8 @@ const TestRunItem = ( /** @type boolean */ selected, /** @type boolean */ userCanEdit, ) => { + const runningStep = item.progress?.find((item) => item.status === 'Running'); + return div( { class: 'table-row flex-row' }, userCanEdit @@ -278,20 +307,37 @@ const TestRunItem = ( ), ), div( - { class: 'flex-row', style: `flex: ${columns[2]}` }, + { style: `flex: ${columns[2]}` }, div( + { class: 'flex-row' }, TestRunStatus(item), - div( + item.status === 'Running' && item.process_id && userCanEdit ? Button({ + type: 'stroked', + label: 'Cancel', + style: 'width: 64px; height: 28px; color: var(--purple); margin-left: 12px;', + onclick: () => emitEvent('RunCanceled', { payload: item }), + }) : null, + ), + item.test_endtime + ? div( { class: 'text-caption mt-1' }, formatDuration(item.test_starttime, item.test_endtime), + ) + : div( + { class: 'text-caption mt-1' }, + item.status === 'Running' && runningStep + ? [ + div( + runningStep.label, + withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px; vertical-align: middle;' }, 'info'), + { text: ProgressTooltip(item) }, + ), + ), + div(runningStep.detail), + ] + : '--', ), - ), - item.status === 'Running' && item.process_id && userCanEdit ? Button({ - type: 'stroked', - label: 'Cancel Run', - style: 'width: auto; height: 32px; color: var(--purple); margin-left: 16px;', - onclick: () => emitEvent('RunCanceled', { payload: item }), - }) : null, ), div( { class: 'pr-3', style: `flex: ${columns[3]}` }, @@ -315,9 +361,9 @@ const TestRunItem = ( : '--', ), ); -} +}; -function TestRunStatus(/** @type TestRun */ item) { +const TestRunStatus = (/** @type TestRun */ item) => { const attributeMap = { Running: { label: 'Running', color: 'blue' }, Complete: { label: 'Completed', color: '' }, @@ -325,27 +371,48 @@ function TestRunStatus(/** @type TestRun */ item) { Cancelled: { label: 'Canceled', color: 'purple' }, }; const attributes = attributeMap[item.status] || { label: 'Unknown', color: 'grey' }; + const hasProgressError = item.progress?.some(({error}) => !!error); return span( { class: 'flex-row', style: `color: var(--${attributes.color});`, }, attributes.label, - () => { - const tooltipError = van.state(false); - return item.status === 'Error' && item.log_message ? i( - { - class: 'material-symbols-rounded text-secondary ml-1', - style: 'position: relative; font-size: 16px;', - onmouseenter: () => tooltipError.val = true, - onmouseleave: () => tooltipError.val = false, - }, - 'info', - Tooltip({ text: item.log_message, show: tooltipError }), - ) : null; - }, + item.status === 'Complete' && hasProgressError + ? withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px; vertical-align: middle; color: var(--orange);' }, 'warning' ), + { text: ProgressTooltip(item) }, + ) + : null, + item.status === 'Error' && item.log_message + ? 
withTooltip( + Icon({ style: 'font-size: 18px; margin-left: 4px;' }, 'info'), + { text: item.log_message, width: 250, style: 'word-break: break-word;' }, + ) + : null, ); -} +}; + +const ProgressTooltip = (/** @type TestRun */ item) => { + return div( + { class: 'flex-column fx-gap-1' }, + item.progress?.map(step => { + const stepIcon = progressStatusIcons[step.status]; + return div( + { class: 'flex-row fx-gap-1' }, + Icon( + { style: `font-size: ${stepIcon.size}px; color: var(--${stepIcon.color}); min-width: 24px;` }, + stepIcon.icon, + ), + div( + { class: 'flex-column fx-align-flex-start text-left' }, + span(`${step.label}${step.detail ? (': ' + step.detail) : ''}`), + span({ style: 'font-size: 12px; opacity: 0.6; margin-top: 2px; white-space: pre-wrap;' }, step.error), + ), + ); + }), + ); +}; const ConditionalEmptyState = ( /** @type ProjectSummary */ projectSummary, @@ -406,7 +473,7 @@ const ConditionalEmptyState = ( const stylesheet = new CSSStyleSheet(); stylesheet.replace(` .tg-test-runs { - min-height: 500px; + min-height: 550px; } `); diff --git a/testgen/ui/components/widgets/download_dialog.py b/testgen/ui/components/widgets/download_dialog.py index 0a43a748..712eeaa0 100644 --- a/testgen/ui/components/widgets/download_dialog.py +++ b/testgen/ui/components/widgets/download_dialog.py @@ -1,5 +1,6 @@ import tempfile from collections.abc import Callable, Iterable +from datetime import datetime from io import BytesIO from typing import TypedDict from zipfile import ZipFile @@ -54,7 +55,11 @@ def get_excel_file_data( # Timestamp worksheet.write("A3", "Exported on", details_key_format) - worksheet.write("B3", date_service.get_timezoned_now(st.session_state), details_value_format) + worksheet.write( + "B3", + date_service.get_timezoned_timestamp(st.session_state, datetime.utcnow()), + details_value_format, + ) # Details if details: diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py index 89119ef9..a128a241 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -92,6 +92,7 @@ def get_profiling_results(profiling_run_id: str, table_name: str | None = None, -- Profile Run profile_run_id::VARCHAR, run_date AS profile_run_date, + query_error AS profiling_error, {COLUMN_PROFILING_FIELDS}, -- Extra fields for sorting and exporting position, @@ -103,7 +104,8 @@ def get_profiling_results(profiling_run_id: str, table_name: str | None = None, WHERE profile_run_id = profile_results.profile_run_id AND table_name = profile_results.table_name AND column_name = profile_results.column_name - ) THEN 'Yes' END AS hygiene_issues + ) THEN 'Yes' END AS hygiene_issues, + CASE WHEN query_error IS NOT NULL THEN 'Error: ' || query_error ELSE NULL END AS result_details FROM profile_results WHERE profile_run_id = :profiling_run_id AND table_name ILIKE :table_name @@ -200,9 +202,9 @@ def get_tables_by_condition( table_chars.table_groups_id::VARCHAR AS table_group_id, -- Characteristics functional_table_type, - record_ct, + approx_record_ct, + table_chars.record_ct, table_chars.column_ct, - data_point_ct, add_date, last_refresh_date, drop_date, @@ -368,6 +370,7 @@ def get_columns_by_condition( column_chars.last_complete_profile_run_id::VARCHAR AS profile_run_id, run_date AS profile_run_date, TRUE AS is_latest_profile, + query_error AS profiling_error, {""" -- Has Test Runs EXISTS( @@ -394,12 +397,13 @@ def get_columns_by_condition( column_chars.dq_score_profiling, column_chars.dq_score_testing, """ if include_scores else 
""} + table_chars.approx_record_ct, {COLUMN_PROFILING_FIELDS} FROM data_column_chars column_chars - {""" LEFT JOIN data_table_chars table_chars ON ( column_chars.table_id = table_chars.table_id ) + {""" LEFT JOIN table_groups ON ( column_chars.table_groups_id = table_groups.id ) diff --git a/testgen/ui/queries/scoring_queries.py b/testgen/ui/queries/scoring_queries.py index 38b7387e..d16243ab 100644 --- a/testgen/ui/queries/scoring_queries.py +++ b/testgen/ui/queries/scoring_queries.py @@ -8,10 +8,11 @@ @st.cache_data(show_spinner="Loading data :gray[:small[(This might take a few minutes)]] ...") def get_all_score_cards(project_code: str) -> list["ScoreCard"]: - return [ - definition.as_cached_score_card() - for definition in ScoreDefinition.all(project_code=project_code) + results = [ + definition.as_cached_score_card(include_definition=True) + for definition in ScoreDefinition.all(project_code=project_code, last_history_items=50) ] + return results def get_score_card_issue_reports(selected_issues: list["SelectedIssue"]) -> list[dict]: diff --git a/testgen/ui/queries/source_data_queries.py b/testgen/ui/queries/source_data_queries.py index 45afd5d7..49348626 100644 --- a/testgen/ui/queries/source_data_queries.py +++ b/testgen/ui/queries/source_data_queries.py @@ -5,7 +5,7 @@ import pandas as pd import streamlit as st -from testgen.common.clean_sql import ConcatColumnList +from testgen.common.clean_sql import concat_columns from testgen.common.database.database_service import get_flavor_service, replace_params from testgen.common.models.connection import Connection, SQLFlavor from testgen.common.models.test_definition import TestDefinition @@ -14,9 +14,10 @@ from testgen.utils import to_dataframe LOG = logging.getLogger("testgen") +DEFAULT_LIMIT = 500 -def get_hygiene_issue_source_query(issue_data: dict) -> str: +def get_hygiene_issue_source_query(issue_data: dict, limit: int = DEFAULT_LIMIT) -> str: def generate_lookup_query(test_id: str, detail_exp: str, column_names: list[str], sql_flavor: SQLFlavor) -> str: if test_id in {"1019", "1020"}: start_index = detail_exp.find("Columns: ") @@ -62,6 +63,9 @@ def generate_lookup_query(test_id: str, detail_exp: str, column_names: list[str] "COLUMN_NAME": issue_data["column_name"], "DETAIL_EXPRESSION": issue_data["detail"], "PROFILE_RUN_DATE": issue_data["profiling_starttime"], + "LIMIT": limit, + "LIMIT_2": int(limit/2), + "LIMIT_4": int(limit/4), } lookup_query = replace_params(lookup_query, params) @@ -72,10 +76,11 @@ def generate_lookup_query(test_id: str, detail_exp: str, column_names: list[str] @st.cache_data(show_spinner=False) def get_hygiene_issue_source_data( issue_data: dict, - limit: int | None = None, + limit: int = DEFAULT_LIMIT, ) -> tuple[Literal["OK"], None, str, pd.DataFrame] | tuple[Literal["NA", "ND", "ERR"], str, str | None, None]: + lookup_query = None try: - lookup_query = get_hygiene_issue_source_query(issue_data) + lookup_query = get_hygiene_issue_source_query(issue_data, limit) if not lookup_query: return "NA", "Source data lookup is not available for this hygiene issue.", None, None @@ -99,7 +104,7 @@ def get_hygiene_issue_source_data( return "ERR", f"Source data lookup encountered an error:\n\n{e.args[0]}", lookup_query, None -def get_test_issue_source_query(issue_data: dict) -> str: +def get_test_issue_source_query(issue_data: dict, limit: int = DEFAULT_LIMIT) -> str: lookup_data = _get_lookup_data(issue_data["table_groups_id"], issue_data["test_type_id"], "Test Results") if not lookup_data or not 
lookup_data.lookup_query: return None @@ -111,7 +116,7 @@ def get_test_issue_source_query(issue_data: dict) -> str: params = { "TARGET_SCHEMA": issue_data["schema_name"], "TABLE_NAME": issue_data["table_name"], - "COLUMN_NAME": issue_data["column_names"], + "COLUMN_NAME": issue_data["column_names"], # Don't quote this - queries already have quotes "COLUMN_TYPE": issue_data["column_type"], "TEST_DATE": str(issue_data["test_date"]), "CUSTOM_QUERY": test_definition.custom_query, @@ -124,18 +129,21 @@ def get_test_issue_source_query(issue_data: dict) -> str: "THRESHOLD_VALUE": test_definition.threshold_value, "SUBSET_CONDITION": test_definition.subset_condition or "1=1", "GROUPBY_NAMES": test_definition.groupby_names, - "HAVING_CONDITION": test_definition.having_condition, + "HAVING_CONDITION": f"HAVING {test_definition.having_condition}" if test_definition.having_condition else "", "MATCH_SCHEMA_NAME": test_definition.match_schema_name, "MATCH_TABLE_NAME": test_definition.match_table_name, "MATCH_COLUMN_NAMES": test_definition.match_column_names, "MATCH_SUBSET_CONDITION": test_definition.match_subset_condition or "1=1", "MATCH_GROUPBY_NAMES": test_definition.match_groupby_names, - "MATCH_HAVING_CONDITION": test_definition.match_having_condition, + "MATCH_HAVING_CONDITION": f"HAVING {test_definition.match_having_condition}" if test_definition.match_having_condition else "", "COLUMN_NAME_NO_QUOTES": issue_data["column_names"], "WINDOW_DATE_COLUMN": test_definition.window_date_column, "WINDOW_DAYS": test_definition.window_days, - "CONCAT_COLUMNS": ConcatColumnList(issue_data["column_names"], ""), - "CONCAT_MATCH_GROUPBY": ConcatColumnList(test_definition.match_groupby_names, ""), + "CONCAT_COLUMNS": concat_columns(issue_data["column_names"], ""), + "CONCAT_MATCH_GROUPBY": concat_columns(test_definition.match_groupby_names, ""), + "LIMIT": limit, + "LIMIT_2": int(limit/2), + "LIMIT_4": int(limit/4), } lookup_query = replace_params(lookup_data.lookup_query, params) @@ -146,14 +154,15 @@ def get_test_issue_source_query(issue_data: dict) -> str: @st.cache_data(show_spinner=False) def get_test_issue_source_data( issue_data: dict, - limit: int | None = None, + limit: int = DEFAULT_LIMIT, ) -> tuple[Literal["OK"], None, str, pd.DataFrame] | tuple[Literal["NA", "ND", "ERR"], str, str | None, None]: + lookup_query = None try: test_definition = TestDefinition.get(issue_data["test_definition_id_current"]) if not test_definition: return "NA", "Test definition no longer exists.", None, None - lookup_query = get_test_issue_source_query(issue_data) + lookup_query = get_test_issue_source_query(issue_data, limit) if not lookup_query: return "NA", "Source data lookup is not available for this test.", None, None diff --git a/testgen/ui/queries/table_group_queries.py b/testgen/ui/queries/table_group_queries.py index c698212a..8cc32c73 100644 --- a/testgen/ui/queries/table_group_queries.py +++ b/testgen/ui/queries/table_group_queries.py @@ -1,18 +1,36 @@ +from collections.abc import Callable +from datetime import UTC, datetime from typing import TypedDict +from uuid import UUID -from sqlalchemy.engine import Row +import streamlit as st -from testgen.commands.queries.profiling_query import CProfilingSQL -from testgen.common.database.database_service import get_flavor_service +from testgen.commands.queries.refresh_data_chars_query import ColumnChars, RefreshDataCharsSQL +from testgen.commands.run_refresh_data_chars import write_data_chars from testgen.common.models.connection import Connection from
testgen.common.models.table_group import TableGroup from testgen.ui.services.database_service import fetch_from_target_db +class StatsPreview(TypedDict): + id: UUID + table_groups_name: str + table_group_schema: str + table_ct: int | None + column_ct: int | None + approx_record_ct: int | None + approx_data_point_ct: int | None + +class TablePreview(TypedDict): + column_ct: int + approx_record_ct: int | None + approx_data_point_ct: int | None + can_access: bool | None + + class TableGroupPreview(TypedDict): - schema: str - tables: dict[str, bool] - column_count: int + stats: StatsPreview + tables: dict[str, TablePreview] success: bool message: str | None @@ -21,52 +39,40 @@ def get_table_group_preview( table_group: TableGroup, connection: Connection | None = None, verify_table_access: bool = False, -) -> TableGroupPreview: +) -> tuple[TableGroupPreview, Callable[[UUID], None]]: table_group_preview: TableGroupPreview = { - "schema": table_group.table_group_schema, + "stats": { + "id": table_group.id, + "table_groups_name": table_group.table_groups_name, + "table_group_schema": table_group.table_group_schema, + }, "tables": {}, - "column_count": 0, "success": True, "message": None, } + save_data_chars = None + if connection or table_group.connection_id: try: connection = connection or Connection.get(table_group.connection_id) + table_group_preview, data_chars, sql_generator = _get_preview(table_group, connection) - table_group_results = _fetch_table_group_columns(connection, table_group) - - for column in table_group_results: - table_group_preview["schema"] = column["table_schema"] - table_group_preview["tables"][column["table_name"]] = None - table_group_preview["column_count"] += 1 - - if len(table_group_results) <= 0: - table_group_preview["success"] = False - table_group_preview["message"] = ( - "No tables found matching the criteria. Please check the Table Group configuration" - " or the database permissions." - ) + def save_data_chars(table_group_id: UUID) -> None: + # Unsaved table groups will not have an ID, so we have to update it after saving + sql_generator.table_group.id = table_group_id + write_data_chars(data_chars, sql_generator, datetime.now(UTC)) if verify_table_access: - schema_name = table_group_preview["schema"] - flavor_service = get_flavor_service(connection.sql_flavor) - quote = flavor_service.quote_character - for table_name in table_group_preview["tables"].keys(): + tables_preview = table_group_preview["tables"] + for table_name in tables_preview.keys(): try: - results = fetch_from_target_db( - connection, - ( - f"SELECT 1 FROM {quote}{schema_name}{quote}.{quote}{table_name}{quote} LIMIT 1" - if not flavor_service.use_top - else f"SELECT TOP 1 * FROM {quote}{schema_name}{quote}.{quote}{table_name}{quote}" - ), - ) + results = fetch_from_target_db(connection, *sql_generator.verify_access(table_name)) except Exception as error: - table_group_preview["tables"][table_name] = False + tables_preview[table_name]["can_access"] = False else: - table_group_preview["tables"][table_name] = results is not None and len(results) > 0 - if not all(table_group_preview["tables"].values()): + if not all(table["can_access"] for table in tables_preview.values()): table_group_preview["message"] = ( "Some tables were not accessible. Please check the database permissions."
) @@ -75,30 +81,79 @@ def get_table_group_preview( table_group_preview["message"] = error.args[0] else: table_group_preview["success"] = False - table_group_preview["message"] = "No connection selected. Please select a connection to preview the Table Group." - return table_group_preview - - -def _fetch_table_group_columns(connection: Connection, table_group: TableGroup) -> list[Row]: - profiling_table_set = table_group.profiling_table_set - - sql_generator = CProfilingSQL(table_group.project_code, connection.sql_flavor) - - sql_generator.table_groups_id = table_group.id - sql_generator.connection_id = str(table_group.connection_id) - sql_generator.profile_run_id = "" - sql_generator.data_schema = table_group.table_group_schema - sql_generator.parm_table_set = ( - ",".join([f"'{item.strip()}'" for item in profiling_table_set.split(",")]) - if profiling_table_set - else profiling_table_set - ) - sql_generator.parm_table_include_mask = table_group.profiling_include_mask - sql_generator.parm_table_exclude_mask = table_group.profiling_exclude_mask - sql_generator.profile_id_column_mask = table_group.profile_id_column_mask - sql_generator.profile_sk_column_mask = table_group.profile_sk_column_mask - sql_generator.profile_use_sampling = "Y" if table_group.profile_use_sampling else "N" - sql_generator.profile_sample_percent = table_group.profile_sample_percent - sql_generator.profile_sample_min_count = table_group.profile_sample_min_count - - return fetch_from_target_db(connection, *sql_generator.GetDDFQuery()) + table_group_preview["message"] = ( + "No connection selected. Please select a connection to preview the Table Group." + ) + + return table_group_preview, save_data_chars + + +def reset_table_group_preview() -> None: + _get_preview.clear() + + +@st.cache_data( + show_spinner=False, + hash_funcs={ + TableGroup: lambda x: ( + x.table_group_schema, + x.profiling_table_set, + x.profiling_include_mask, + x.profiling_exclude_mask, + ), + Connection: lambda x: x.to_dict(), + }, +) +def _get_preview( + table_group: TableGroup, + connection: Connection, +) -> tuple[TableGroupPreview, list[ColumnChars], RefreshDataCharsSQL]: + sql_generator = RefreshDataCharsSQL(connection, table_group) + data_chars = fetch_from_target_db(connection, *sql_generator.get_schema_ddf()) + data_chars = [ColumnChars(**column) for column in data_chars] + + preview: TableGroupPreview = { + "stats": { + "id": table_group.id, + "table_groups_name": table_group.table_groups_name, + "table_group_schema": table_group.table_group_schema, + "table_ct": 0, + "column_ct": 0, + "approx_record_ct": None, + "approx_data_point_ct": None, + }, + "tables": {}, + "success": True, + "message": None, + } + stats = preview["stats"] + tables = preview["tables"] + + for column in data_chars: + if not tables.get(column.table_name): + tables[column.table_name] = { + "column_ct": 0, + "approx_record_ct": column.approx_record_ct, + "approx_data_point_ct": None, + "can_access": None, + } + stats["table_ct"] += 1 + if column.approx_record_ct is not None: + stats["approx_record_ct"] = (stats["approx_record_ct"] or 0) + column.approx_record_ct + + stats["column_ct"] += 1 + tables[column.table_name]["column_ct"] += 1 + if column.approx_record_ct is not None: + stats["approx_data_point_ct"] = (stats["approx_data_point_ct"] or 0) + column.approx_record_ct + tables[column.table_name]["approx_data_point_ct"] = ( + tables[column.table_name]["approx_data_point_ct"] or 0 + ) + column.approx_record_ct + + if len(data_chars) <= 0: + preview["success"] = False + 
preview["message"] = ( + "No tables found matching the criteria. Please check the Table Group configuration" + " or the database permissions." + ) + + return preview, data_chars, sql_generator diff --git a/testgen/ui/queries/test_result_queries.py b/testgen/ui/queries/test_result_queries.py index f11abea6..806ec032 100644 --- a/testgen/ui/queries/test_result_queries.py +++ b/testgen/ui/queries/test_result_queries.py @@ -43,19 +43,16 @@ def get_test_results( r.result_code as passed_ct, (1 - r.result_code)::INTEGER as exception_ct, CASE - WHEN result_status = 'Warning' - AND result_message NOT ILIKE 'Inactivated%%' THEN 1 + WHEN result_status = 'Warning' THEN 1 END::INTEGER as warning_ct, CASE - WHEN result_status = 'Failed' - AND result_message NOT ILIKE 'Inactivated%%' THEN 1 + WHEN result_status = 'Failed' THEN 1 END::INTEGER as failed_ct, CASE - WHEN result_status = 'Log' - AND result_message NOT ILIKE 'Inactivated%%' THEN 1 + WHEN result_status = 'Log' THEN 1 END::INTEGER as log_ct, CASE - WHEN result_message ILIKE 'Inactivated%%' THEN 1 + WHEN result_status = 'Error' THEN 1 END as execution_error_ct, p.project_code, r.table_groups_id::VARCHAR, r.id::VARCHAR as test_result_id, r.test_run_id::VARCHAR, diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index 2d9e99f3..70e8f752 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -261,7 +261,11 @@ def render_grid_select( selected_column, paginator_column = st.columns([.5, .5]) with paginator_column: def on_page_change(): - st.session_state[f"{key}_page_change"] = True + # Ignore the on_change event fired during paginator initialization + if st.session_state.get(f"{key}_paginator_loaded", False): + st.session_state[f"{key}_page_change"] = True + else: + st.session_state[f"{key}_paginator_loaded"] = True page_index = testgen.paginator( count=len(df), diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index c492dde5..fc7938f2 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -15,7 +15,7 @@ from sqlalchemy.exc import DatabaseError, DBAPIError import testgen.ui.services.database_service as db -from testgen.commands.run_profiling_bridge import run_profiling_in_background +from testgen.commands.run_profiling import run_profiling_in_background from testgen.common.database.database_service import empty_cache, get_flavor_service from testgen.common.models import with_database_session from testgen.common.models.connection import Connection, ConnectionMinimal @@ -119,6 +119,10 @@ def on_save_connection_clicked(updated_connection): elif updated_connection.get("project_pw_encrypted") == CLEAR_SENTINEL: updated_connection["project_pw_encrypted"] = "" + if updated_connection.get("connect_with_identity"): + updated_connection["project_user"] = "" + updated_connection["project_pw_encrypted"] = "" + updated_connection["sql_flavor"] = self._get_sql_flavor_from_value(updated_connection["sql_flavor_code"]).flavor set_save(True) @@ -143,11 +147,19 @@ def on_test_connection_clicked(updated_connection: dict) -> None: elif updated_connection.get("private_key_passphrase") == CLEAR_SENTINEL: updated_connection["private_key_passphrase"] = "" + if updated_connection.get("connect_with_identity"): + updated_connection["project_user"] = "" + updated_connection["project_pw_encrypted"] = "" + updated_connection["sql_flavor"] = self._get_sql_flavor_from_value(updated_connection["sql_flavor_code"]).flavor set_check_status(True) 
set_updated_connection(self._sanitize_connection_input(updated_connection)) + def on_setup_table_group_clicked(*_args) -> None: + table_group_queries.reset_table_group_preview() + self.setup_data_configuration(project_code, connection.connection_id) + results = None for key, value in get_updated_connection().items(): setattr(connection, key, value) @@ -188,7 +200,7 @@ def on_test_connection_clicked(updated_connection: dict) -> None: on_change_handlers={ "TestConnectionClicked": on_test_connection_clicked, "SaveConnectionClicked": on_save_connection_clicked, - "SetupTableGroupClicked": lambda _: self.setup_data_configuration(project_code, connection.connection_id), + "SetupTableGroupClicked": on_setup_table_group_clicked, "ConnectionUpdated": on_connection_updated, }, ) @@ -266,6 +278,7 @@ def on_save_table_group_clicked(payload: dict) -> None: run_profiling: bool = payload.get("run_profiling", False) set_new_table_group(table_group) + mark_for_preview(True) set_table_group_verified(table_group_verified) set_run_profiling(run_profiling) mark_for_save(True) @@ -328,8 +341,9 @@ def on_preview_table_group(payload: dict) -> None: ) table_group_preview = None + save_data_chars = None if should_preview(): - table_group_preview = table_group_queries.get_table_group_preview( + table_group_preview, save_data_chars = table_group_queries.get_table_group_preview( table_group, verify_table_access=should_verify_access(), ) @@ -346,6 +360,12 @@ def on_preview_table_group(payload: dict) -> None: monitor_schedule_timezone=st.session_state["browser_timezone"] or "UTC", ) + if save_data_chars: + try: + save_data_chars(table_group.id) + except Exception: + LOG.exception("Data characteristics refresh encountered errors") + if should_run_profiling: try: run_profiling_in_background(table_group.id) diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index f4984577..3c8d2fea 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -113,7 +113,7 @@ def render(self, project_code: str, table_group_id: str | None = None, selected: "RunProfilingClicked": partial( run_profiling_dialog, project_code, - selected_table_group, + selected_table_group.id, ), "TableGroupSelected": on_table_group_selected, "ItemSelected": on_item_selected, @@ -234,7 +234,7 @@ def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: Ta "add_date": {"header": "First detected"}, "last_mod_date": {"header": "Modification detected"}, "drop_date": {"header": "Drop detected"}, - "record_ct": {"header": "Record count"}, + "record_ct": {"header": "Row count"}, "value_ct": {"header": "Value count"}, "distinct_value_ct": {"header": "Distinct values"}, "null_value_ct": {"header": "Null values"}, diff --git a/testgen/ui/views/dialogs/run_profiling_dialog.py b/testgen/ui/views/dialogs/run_profiling_dialog.py index 3d5b6d6e..74d6dc02 100644 --- a/testgen/ui/views/dialogs/run_profiling_dialog.py +++ b/testgen/ui/views/dialogs/run_profiling_dialog.py @@ -1,84 +1,72 @@ import time +from uuid import UUID import streamlit as st -from testgen.commands.run_profiling_bridge import run_profiling_in_background -from testgen.common.models import with_database_session -from testgen.common.models.table_group import TableGroup, TableGroupMinimal +from testgen.commands.run_profiling import run_profiling_in_background +from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.models.table_group import TableGroup from testgen.ui.components import widgets as testgen -from 
testgen.ui.session import session -from testgen.utils import to_dataframe +from testgen.ui.navigation.router import Router +from testgen.ui.session import session, temp_value -LINK_KEY = "run_profiling_dialog:keys:go-to-runs" LINK_HREF = "profiling-runs" @st.dialog(title="Run Profiling") -@with_database_session -def run_profiling_dialog(project_code: str, table_group: TableGroupMinimal | None = None, default_table_group_id: str | None = None) -> None: - if table_group: - table_group_id: str = str(table_group.id) - table_group_name: str = table_group.table_groups_name - else: - table_groups = TableGroup.select_minimal_where(TableGroup.project_code == project_code) - table_groups_df = to_dataframe(table_groups, TableGroupMinimal.columns()) - table_group_id: str = testgen.select( - label="Table Group", - options=table_groups_df, - value_column="id", - display_column="table_groups_name", - default_value=default_table_group_id, - required=True, - placeholder="Select table group to profile", - ) - if table_group_id: - table_group_name: str = table_groups_df.loc[table_groups_df["id"] == table_group_id, "table_groups_name"].iloc[0] - testgen.whitespace(1) +def run_profiling_dialog(project_code: str, table_group_id: str | UUID | None = None, allow_selection: bool = False) -> None: + if not table_group_id and not allow_selection: + raise ValueError("Table Group ID must be specified when selection is not allowed") - if table_group_id: - with st.container(): - st.markdown(f"Execute profiling for the table group **{table_group_name}**?") - st.markdown(":material/info: _Profiling will be performed in a background process._") + def on_go_to_profiling_runs_clicked(table_group_id: str) -> None: + set_navigation_params({"project_code": project_code, "table_group_id": table_group_id}) - if testgen.expander_toggle(expand_label="Show CLI command", key="test_suite:keys:run-tests-show-cli"): - st.code(f"testgen run-profile --table-group-id {table_group_id}", language="shellSession") + def on_run_profiling_confirmed(table_group: dict) -> None: + set_table_group(table_group) + set_run_profiling(True) - button_container = st.empty() - status_container = st.empty() + get_navigation_params, set_navigation_params = temp_value("run_profiling_dialog:go_to_profiling_run", default=None) + if params := get_navigation_params(): + Router().navigate(to=LINK_HREF, with_args=params) - with button_container: - _, button_column = st.columns([.85, .15]) - with button_column: - profile_button = st.button("Run Profiling", use_container_width=True, disabled=not table_group_id) + should_run_profiling, set_run_profiling = temp_value("run_profiling_dialog:run_profiling", default=False) + get_table_group, set_table_group = temp_value("run_profiling_dialog:table_group", default=None) - if profile_button: - button_container.empty() - status_container.info("Starting profiling run ...") + table_groups = TableGroup.select_stats( + project_code=project_code, + table_group_id=table_group_id if not allow_selection else None, + ) + + result = None + if should_run_profiling(): + selected_table_group = get_table_group() + success = True + message = f"Profiling run started for table group '{selected_table_group['table_groups_name']}'." 
+ show_link = session.current_page != LINK_HREF try: - run_profiling_in_background(table_group_id) - except Exception as e: - status_container.error(f"Profiling run encountered errors: {e!s}.") + run_profiling_in_background(selected_table_group["id"]) + except Exception as error: + success = False + message = f"Profiling run could not be started: {error!s}." + show_link = False + result = {"success": success, "message": message, "show_link": show_link} - # The second condition is needed for the link to work - if profile_button or st.session_state.get(LINK_KEY): - with status_container.container(): - st.success( - f"Profiling run started for table group **{table_group_name}**." - ) + testgen.testgen_component( + "run_profiling_dialog", + props={ + "table_groups": [table_group.to_dict(json_safe=True) for table_group in table_groups], + "selected_id": str(table_group_id), + "allow_selection": allow_selection, + "result": result, + }, + on_change_handlers={ + "GoToProfilingRunsClicked": on_go_to_profiling_runs_clicked, + "RunProfilingConfirmed": on_run_profiling_confirmed, + }, + ) - if session.current_page != LINK_HREF: - testgen.link( - label="Go to Profiling Runs", - href=LINK_HREF, - params={ "project_code": project_code, "table_group": table_group_id }, - right_icon="chevron_right", - underline=False, - height=40, - key=LINK_KEY, - style="margin-left: auto; border-radius: 4px; border: var(--button-stroked-border); padding: 8px 8px 8px 16px; color: var(--primary-color)", - ) - else: - time.sleep(2) - st.cache_data.clear() - st.rerun() + if result and result["success"] and not result["show_link"]: + time.sleep(2) + ProfilingRun.select_summary.clear() + st.rerun() diff --git a/testgen/ui/views/dialogs/run_tests_dialog.py b/testgen/ui/views/dialogs/run_tests_dialog.py index 808451db..c01bd049 100644 --- a/testgen/ui/views/dialogs/run_tests_dialog.py +++ b/testgen/ui/views/dialogs/run_tests_dialog.py @@ -2,7 +2,7 @@ import streamlit as st -from testgen.commands.run_execute_tests import run_execution_steps_in_background +from testgen.commands.run_test_execution import run_test_execution_in_background from testgen.common.models import with_database_session from testgen.common.models.test_suite import TestSuite, TestSuiteMinimal from testgen.ui.components import widgets as testgen @@ -42,7 +42,7 @@ def run_tests_dialog(project_code: str, test_suite: TestSuiteMinimal | None = No if testgen.expander_toggle(expand_label="Show CLI command", key="run_tests_dialog:keys:show-cli"): st.code( - f"testgen run-tests --project-key {project_code} --test-suite-key '{test_suite_name}'", + f"testgen run-tests --test-suite-id {test_suite_id}", language="shellSession" ) @@ -60,7 +60,7 @@ def run_tests_dialog(project_code: str, test_suite: TestSuiteMinimal | None = No status_container.info("Starting test run ...") try: - run_execution_steps_in_background(project_code, test_suite_name) + run_test_execution_in_background(test_suite_id) except Exception as e: status_container.error(f"Test run encountered errors: {e!s}.") diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index 7c6d3b1d..8eb1f533 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -571,7 +571,7 @@ def source_data_dialog(selected_row): st.markdown("#### SQL Query") query = get_hygiene_issue_source_query(selected_row) if query: - st.code(query, language="sql", height=100) + st.code(query, language="sql", wrap_lines=True, height=100) with st.spinner("Retrieving source data..."): 
bad_data_status, bad_data_msg, _, df_bad = get_hygiene_issue_source_data(selected_row, limit=500) diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index e789f3a4..0c5deff8 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -125,8 +125,8 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | selected, selected_row = fm.render_grid_select( df, - ["table_name", "column_name", "db_data_type", "semantic_data_type", "hygiene_issues"], - ["Table", "Column", "Data Type", "Semantic Data Type", "Hygiene Issues"], + ["table_name", "column_name", "db_data_type", "semantic_data_type", "hygiene_issues", "result_details"], + ["Table", "Column", "Data Type", "Semantic Data Type", "Hygiene Issues", "Details"], id_column="id", reset_pagination=filters_changed, bind_to_query=True, @@ -225,7 +225,7 @@ def get_excel_report_data( "db_data_type": {"header": "Data type"}, "datatype_suggestion": {"header": "Suggested data type"}, "semantic_data_type": {}, - "record_ct": {"header": "Record count"}, + "record_ct": {"header": "Row count"}, "value_ct": {"header": "Value count"}, "distinct_value_ct": {"header": "Distinct values"}, "null_value_ct": {"header": "Null values"}, @@ -269,6 +269,7 @@ def get_excel_report_data( "within_1mo_date_ct": {"header": "Within 1 month"}, "future_date_ct": {"header": "Future dates"}, "boolean_true_ct": {"header": "Boolean true values"}, + "result_details": {"header": "Details"}, } return get_excel_file_data( data, diff --git a/testgen/ui/views/profiling_runs.py b/testgen/ui/views/profiling_runs.py index ffea8d10..f0b442bb 100644 --- a/testgen/ui/views/profiling_runs.py +++ b/testgen/ui/views/profiling_runs.py @@ -75,7 +75,7 @@ def render(self, project_code: str, table_group_id: str | None = None, **_kwargs on_change_handlers={ "FilterApplied": on_profiling_runs_filtered, "RunSchedulesClicked": lambda *_: ProfilingScheduleDialog().open(project_code), - "RunProfilingClicked": lambda *_: run_profiling_dialog(project_code, None, table_group_id), + "RunProfilingClicked": lambda *_: run_profiling_dialog(project_code, table_group_id, allow_selection=True), "RefreshData": refresh_data, "RunsDeleted": partial(on_delete_runs, project_code, table_group_id), }, @@ -122,7 +122,7 @@ def get_job_arguments(self, arg_value: str) -> tuple[list[typing.Any], dict[str, def on_cancel_run(profiling_run: dict) -> None: process_status, process_message = process_service.kill_profile_run(to_int(profiling_run["process_id"])) if process_status: - ProfilingRun.update_status(profiling_run["profiling_run_id"], "Cancelled") + ProfilingRun.cancel_run(profiling_run["id"]) fm.reset_post_updates(str_message=f":{'green' if process_status else 'red'}[{process_message}]", as_toast=True) @@ -171,7 +171,7 @@ def on_delete_confirmed(*_args) -> None: if profiling_run.status == "Running": process_status, _ = process_service.kill_profile_run(to_int(profiling_run.process_id)) if process_status: - ProfilingRun.update_status(profiling_run.profiling_run_id, "Cancelled") + ProfilingRun.cancel_run(profiling_run.id) ProfilingRun.cascade_delete(profiling_run_ids) st.rerun() except Exception: diff --git a/testgen/ui/views/score_details.py b/testgen/ui/views/score_details.py index 25a19c25..9148128c 100644 --- a/testgen/ui/views/score_details.py +++ b/testgen/ui/views/score_details.py @@ -80,7 +80,7 @@ def render( issues = None with st.spinner(text="Loading data :gray[:small[(This might take a few minutes)]] ..."): 
user_can_edit = session.auth.user_has_permission("edit") - score_card = format_score_card(score_definition.as_cached_score_card()) + score_card = format_score_card(score_definition.as_cached_score_card(include_definition=True)) if score_type not in typing.get_args(ScoreTypes): score_type = None if not score_type: diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 78da867d..3bf9ca98 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -7,7 +7,7 @@ import streamlit as st from sqlalchemy.exc import IntegrityError -from testgen.commands.run_profiling_bridge import run_profiling_in_background +from testgen.commands.run_profiling import run_profiling_in_background from testgen.common.models import with_database_session from testgen.common.models.connection import Connection from testgen.common.models.project import Project @@ -18,6 +18,7 @@ from testgen.ui.queries import table_group_queries from testgen.ui.session import session, temp_value from testgen.ui.views.connections import FLAVOR_OPTIONS, format_connection +from testgen.ui.views.dialogs.run_profiling_dialog import run_profiling_dialog from testgen.ui.views.profiling_runs import ProfilingScheduleDialog LOG = logging.getLogger("testgen") @@ -63,6 +64,14 @@ def render( table_groups = TableGroup.select_minimal_where(*table_group_filters) connections = self._get_connections(project_code) + def on_add_table_group_clicked(*_args) -> None: + table_group_queries.reset_table_group_preview() + self.add_table_group_dialog(project_code, connection_id) + + def on_edit_table_group_clicked(table_group_id: str) -> None: + table_group_queries.reset_table_group_preview() + self.edit_table_group_dialog(project_code, table_group_id) + return testgen.testgen_component( "table_group_list", props={ @@ -77,10 +86,10 @@ def render( }, on_change_handlers={ "RunSchedulesClicked": lambda *_: ProfilingScheduleDialog().open(project_code), - "AddTableGroupClicked": partial(self.add_table_group_dialog, project_code, connection_id), - "EditTableGroupClicked": partial(self.edit_table_group_dialog, project_code), + "AddTableGroupClicked": on_add_table_group_clicked, + "EditTableGroupClicked": on_edit_table_group_clicked, "DeleteTableGroupClicked": partial(self.delete_table_group_dialog, project_code), - "RunProfilingClicked": partial(self.run_profiling_dialog, project_code), + "RunProfilingClicked": partial(run_profiling_dialog, project_code), "TableGroupsFiltered": lambda params: self.router.queue_navigation( to="table-groups", with_args={"project_code": project_code, **params}, @@ -90,7 +99,7 @@ def render( @st.dialog(title="Add Table Group") @with_database_session - def add_table_group_dialog(self, project_code: str, connection_id: str | None, *_args): + def add_table_group_dialog(self, project_code: str, connection_id: str | None): return self._table_group_wizard( project_code, connection_id=connection_id, @@ -134,6 +143,7 @@ def on_save_table_group_clicked(payload: dict): table_group_verified: bool = payload.get("table_group_verified", False) run_profiling: bool = payload.get("run_profiling", False) + mark_for_preview(True) set_save(True) set_table_group(table_group) set_table_group_verified(table_group_verified) @@ -182,6 +192,7 @@ def on_go_to_profiling_runs(params: dict) -> None: setattr(table_group, key, value) table_group_preview = None + save_data_chars = None if is_table_group_used: table_group.table_group_schema = original_table_group_schema @@ -201,7 +212,7 @@ def 
on_go_to_profiling_runs(params: dict) -> None: ] if should_preview(): - table_group_preview = table_group_queries.get_table_group_preview( + table_group_preview, save_data_chars = table_group_queries.get_table_group_preview( table_group, verify_table_access=should_verify_access(), ) @@ -217,6 +228,13 @@ def on_go_to_profiling_runs(params: dict) -> None: add_monitor_test_suite=add_monitor_test_suite, monitor_schedule_timezone=st.session_state["browser_timezone"] or "UTC", ) + + if save_data_chars: + try: + save_data_chars(table_group.id) + except Exception: + LOG.exception("Data characteristics refresh encountered errors") + if should_run_profiling(): try: run_profiling_in_background(table_group.id) @@ -286,52 +304,6 @@ def _format_table_group_list( return formatted_list - @st.dialog(title="Run Profiling") - def run_profiling_dialog(self, project_code: str, table_group_id: str) -> None: - def on_go_to_profiling_runs_clicked(table_group_id: str) -> None: - set_navigation_params({ "project_code": project_code, "table_group_id": table_group_id }) - - def on_run_profiling_confirmed(*_args) -> None: - set_run_profiling(True) - - get_navigation_params, set_navigation_params = temp_value( - f"table_groups:{table_group_id}:go_to_profiling_run", - default=None, - ) - if (params := get_navigation_params()): - self.router.navigate(to="profiling-runs", with_args=params) - - should_run_profiling, set_run_profiling = temp_value( - f"table_groups:{table_group_id}:run_profiling", - default=False, - ) - - table_group = TableGroup.get_minimal(table_group_id) - result = None - if should_run_profiling(): - success = True - message = "Profiling run started" - - try: - run_profiling_in_background(table_group_id) - except Exception as error: - success = False - message = f"Profiling run encountered errors: {error!s}." 
- result = {"success": success, "message": message} - - return testgen.testgen_component( - "run_profiling_dialog", - props={ - "project_code": project_code, - "table_group": table_group.to_dict(json_safe=True), - "result": result, - }, - on_change_handlers={ - "GoToProfilingRunsClicked": on_go_to_profiling_runs_clicked, - "RunProfilingConfirmed": on_run_profiling_confirmed, - }, - ) - @st.dialog(title="Delete Table Group") @with_database_session def delete_table_group_dialog(self, project_code: str, table_group_id: str): diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 8b8a2a89..1f02a906 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -6,7 +6,7 @@ import pandas as pd import streamlit as st -from sqlalchemy import and_, asc, func, or_, tuple_ +from sqlalchemy import and_, asc, desc, func, or_, tuple_ from streamlit.delta_generator import DeltaGenerator from streamlit_extras.no_default_selectbox import selectbox @@ -74,7 +74,7 @@ def render( ], ) - table_filter_column, column_filter_column, test_filter_column, table_actions_column = st.columns([.3, .3, .3, .4], vertical_alignment="bottom") + table_filter_column, column_filter_column, test_filter_column, sort_column, table_actions_column = st.columns([.2, .2, .2, .1, .25], vertical_alignment="bottom") testgen.flex_row_end(table_actions_column) actions_column, disposition_column = st.columns([.5, .5]) @@ -123,6 +123,15 @@ def render( label="Test Type", ) + with sort_column: + sortable_columns = ( + ("Table", "table_name"), + ("Column", "column_name"), + ("Test Type", "test_type"), + ) + default = [(sortable_columns[i][1], "ASC") for i in (0, 1, 2)] + sorting_columns = testgen.sorting_selector(sortable_columns, default) + if user_can_disposition: with disposition_column: multi_select = st.toggle("Multi-Select", help="Toggle on to perform actions on multiple test definitions") @@ -142,7 +151,7 @@ def render( with st.container(): with st.spinner("Loading data ..."): - df = get_test_definitions(test_suite, table_name, column_name, test_type) + df = get_test_definitions(test_suite, table_name, column_name, test_type, sorting_columns) selected, selected_test_def = render_grid(df, multi_select, filters_changed) @@ -1147,6 +1156,7 @@ def get_test_definitions( table_name: str | None = None, column_name: str | None = None, test_type: str | None = None, + sorting_columns: list[str] | None = None, ) -> pd.DataFrame: clauses = [TestDefinition.test_suite_id == test_suite.id] if table_name: @@ -1155,7 +1165,15 @@ def get_test_definitions( clauses.append(TestDefinition.column_name.ilike(column_name)) if test_type: clauses.append(TestDefinition.test_type == test_type) - test_definitions = TestDefinition.select_where(*clauses) + + sort_funcs = {"ASC": asc, "DESC": desc} + test_definitions = TestDefinition.select_where( + *clauses, + order_by=tuple([ + sort_funcs[direction](func.lower(getattr(TestDefinition, attribute))) + for (attribute, direction) in sorting_columns + ]) if sorting_columns else None, + ) df = to_dataframe(test_definitions, TestDefinitionSummary.columns()) date_service.accommodate_dataframe_to_timezone(df, st.session_state) @@ -1217,7 +1235,7 @@ def get_columns(table_groups_id: str) -> list[dict]: def validate_test(test_definition, table_group: TableGroupMinimal): schema = test_definition["schema_name"] table_name = test_definition["table_name"] - connection = Connection.get_by_table_group(table_group.id) + connection = 
Connection.get(table_group.connection_id) if test_definition["test_type"] == "Condition_Flag": condition = test_definition["custom_query"] diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index e88fc859..9bfbb653 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -741,7 +741,7 @@ def render_binary_chart(data: pd.DataFrame, **params: dict) -> None: history["test_start"] = history["test_date"].apply(datetime.fromisoformat) history["test_end"] = history["test_start"].apply(lambda start: start + timedelta(seconds=60)) history["formatted_test_date"] = history["test_date"].apply(lambda date_str: datetime.fromisoformat(date_str).strftime("%I:%M:%S %p, %d/%m/%Y")) - history["result_measure_with_status"] = history.apply(lambda row: f"{legend_labels[str(int(row['result_measure']))]} ({row['result_status']})", axis=1) + history["result_measure_with_status"] = history.apply(lambda row: f"{legend_labels[str(int(row['result_measure'])) if not pd.isnull(row['result_measure']) else "0"]} ({row['result_status']})", axis=1) fig = px.timeline( history, @@ -814,7 +814,7 @@ def source_data_dialog(selected_row): else: query = get_test_issue_source_query(selected_row) if query: - st.code(query, language="sql", height=100) + st.code(query, language="sql", wrap_lines=True, height=100) with st.spinner("Retrieving source data..."): if selected_row["test_type"] == "CUSTOM": diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index 33cf379d..a1a802e3 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -118,22 +118,22 @@ def init(self) -> None: self.test_suites = TestSuite.select_minimal_where(TestSuite.project_code == self.project_code) def get_arg_value(self, job): - return job.kwargs["test_suite_key"] + return next(item.test_suite for item in self.test_suites if str(item.id) == job.kwargs["test_suite_id"]) def get_arg_value_options(self) -> list[dict[str, str]]: return [ - {"value": test_suite.test_suite, "label": test_suite.test_suite} + {"value": str(test_suite.id), "label": test_suite.test_suite} for test_suite in self.test_suites ] def get_job_arguments(self, arg_value: str) -> tuple[list[typing.Any], dict[str, typing.Any]]: - return [], {"project_key": self.project_code, "test_suite_key": arg_value} + return [], {"test_suite_id": str(arg_value)} def on_cancel_run(test_run: dict) -> None: process_status, process_message = process_service.kill_test_run(to_int(test_run["process_id"])) if process_status: - TestRun.update_status(test_run["test_run_id"], "Cancelled") + TestRun.cancel_run(test_run["test_run_id"]) fm.reset_post_updates(str_message=f":{'green' if process_status else 'red'}[{process_message}]", as_toast=True) @@ -181,7 +181,7 @@ def on_delete_confirmed(*_args) -> None: if test_run.status == "Running": process_status, _ = process_service.kill_test_run(to_int(test_run.process_id)) if process_status: - TestRun.update_status(test_run.test_run_id, "Cancelled") + TestRun.cancel_run(test_run.test_run_id) TestRun.cascade_delete(test_run_ids) st.rerun() except Exception: diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 31a75f0f..0345c42a 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -105,14 +105,12 @@ def show_test_suite(mode, project_code, table_groups: Iterable[TableGroupMinimal connection_id = selected_test_suite.connection_id if mode == "edit" else None table_groups_id = selected_test_suite.table_groups_id if mode 
== "edit" else None test_suite_description = empty_if_null(selected_test_suite.test_suite_description) if mode == "edit" else "" - test_action = empty_if_null(selected_test_suite.test_action) if mode == "edit" else "" try: severity_index = severity_options.index(selected_test_suite.severity) if mode == "edit" else 0 except ValueError: severity_index = 0 export_to_observability = selected_test_suite.export_to_observability if mode == "edit" else False dq_score_exclude = selected_test_suite.dq_score_exclude if mode == "edit" else False - test_suite_schema = empty_if_null(selected_test_suite.test_suite_schema) if mode == "edit" else "" component_key = empty_if_null(selected_test_suite.component_key) if mode == "edit" else "" component_type = empty_if_null(selected_test_suite.component_type) if mode == "edit" else "dataset" component_name = empty_if_null(selected_test_suite.component_name) if mode == "edit" else "" @@ -140,7 +138,6 @@ def show_test_suite(mode, project_code, table_groups: Iterable[TableGroupMinimal "test_suite_description": left_column.text_input( label="Test Suite Description", max_chars=40, value=test_suite_description ), - "test_action": test_action, "severity": right_column.selectbox( label="Severity", options=severity_options, @@ -148,7 +145,6 @@ def show_test_suite(mode, project_code, table_groups: Iterable[TableGroupMinimal index=severity_index, help="Overrides the default severity in 'Test Definition' and/or 'Test Run'.", ), - "test_suite_schema": test_suite_schema, "export_to_observability": left_column.checkbox( "Export to Observability", value=export_to_observability, diff --git a/testgen/utils/__init__.py b/testgen/utils/__init__.py index 3e864034..2b295ff7 100644 --- a/testgen/utils/__init__.py +++ b/testgen/utils/__init__.py @@ -58,6 +58,10 @@ def try_json(value: str | None, default: T | None) -> T: return json.loads(value) except: return default + + +def get_exception_message(exception: Exception) -> str: + return exception.args[0].rstrip() if exception.args and isinstance(exception.args[0], str) else str(exception) # https://github.com/streamlit/streamlit/issues/798#issuecomment-1647759949 diff --git a/tests/unit/test_common_email.py b/tests/unit/test_common_email.py new file mode 100644 index 00000000..f0e94cee --- /dev/null +++ b/tests/unit/test_common_email.py @@ -0,0 +1,68 @@ +from unittest.mock import ANY, call, patch + +import pytest + +from testgen.common.email import BaseEmailTemplate, EmailTemplateException + + +class TestEmailTemplate(BaseEmailTemplate): + + def get_subject_template(self) -> str: + return "{{project}}: Test execution finished" + + def get_body_template(self) -> str: + return "
DataKitchen TestGen\n\nHi, {{user}}!
" + + +@pytest.fixture +def smtp_mock(): + with patch("testgen.common.email.smtplib.SMTP_SSL") as mock: + yield mock + + +@pytest.fixture +def def_settings(): + with patch("testgen.common.email.settings") as mock: + mock.EMAIL_FROM_ADDRESS = "from@email" + mock.SMTP_ENDPOINT = "smtp-endpoint" + mock.SMTP_PORT = 333 + mock.SMTP_USERNAME = "smtp-user" + mock.SMTP_PASSWORD = "smtp-pass" # noqa: S105 + yield mock + + +@pytest.fixture +def template(smtp_mock, def_settings): + yield TestEmailTemplate() + + +@pytest.fixture +def send_args(): + return ["test@data.kitchen"], {"project": "Test Project", "user": "Test user"} + + +def test_send_email(smtp_mock, template, send_args, def_settings): + template.send(*send_args) + + smtp_mock.assert_has_calls( + [ + call("smtp-endpoint", 333, context=ANY), + call().__enter__().login("smtp-user", "smtp-pass"), + call().__enter__().sendmail("from@email", ["test@data.kitchen"], ANY) + ], + any_order=True, + ) + email_body = smtp_mock().__enter__().sendmail.call_args_list[0][0][2] + assert "
DataKitchen TestGen
" in email_body + assert "Subject: Test Project: Test execution finished" in email_body + assert "
Hi, Test user!
" in email_body + + +@pytest.mark.parametrize( + "missing", + ("EMAIL_FROM_ADDRESS", "SMTP_ENDPOINT", "SMTP_PORT", "SMTP_USERNAME", "SMTP_PASSWORD") +) +def test_settings_validation(missing, template, def_settings, send_args): + setattr(def_settings, missing, None) + with pytest.raises(EmailTemplateException, match="Invalid or insufficient email/SMTP settings"): + template.send(*send_args) diff --git a/tests/unit/test_profiling_query.py b/tests/unit/test_profiling_query.py deleted file mode 100644 index 368fb5b6..00000000 --- a/tests/unit/test_profiling_query.py +++ /dev/null @@ -1,68 +0,0 @@ -import pytest - -from testgen.commands.queries.profiling_query import CProfilingSQL - - -@pytest.mark.unit -def test_include_exclude_mask_basic(): - # test configuration - project_code = "dummy_project_code" - flavor = "postgresql" - profiling_query = CProfilingSQL(project_code, flavor) - profiling_query.parm_table_set = "" - profiling_query.parm_table_include_mask = "important%, %useful%" - profiling_query.parm_table_exclude_mask = "temp%,tmp%,raw_slot_utilization%,gps_product_step_change_log" - - # test run - query, _ = profiling_query.GetDDFQuery() - - # test assertions - assert "SELECT 'dummy_project_code'" in query - assert r"""AND ( - (c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful%' ) - )""" in query - assert r"""AND NOT ( - (c.table_name LIKE 'temp%' ) OR (c.table_name LIKE 'tmp%' ) OR (c.table_name LIKE 'raw\_slot\_utilization%' ) OR (c.table_name LIKE 'gps\_product\_step\_change\_log' ) - )""" in query - - -@pytest.mark.unit -@pytest.mark.parametrize("mask", ("", None)) -def test_include_empty_exclude_mask(mask): - # test configuration - project_code = "dummy_project_code" - flavor = "snowflake" - profiling_query = CProfilingSQL(project_code, flavor) - profiling_query.parm_table_set = "" - profiling_query.parm_table_include_mask = mask - profiling_query.parm_table_exclude_mask = "temp%,tmp%,raw_slot_utilization%,gps_product_step_change_log" - - # test run - query, _ = profiling_query.GetDDFQuery() - print(query) - - # test assertions - assert r"""AND NOT ( - (c.table_name LIKE 'temp%' ESCAPE '\\') OR (c.table_name LIKE 'tmp%' ESCAPE '\\') OR (c.table_name LIKE 'raw\\_slot\\_utilization%' ESCAPE '\\') OR (c.table_name LIKE 'gps\\_product\\_step\\_change\\_log' ESCAPE '\\') - )""" in query - - -@pytest.mark.unit -@pytest.mark.parametrize("mask", ("", None)) -def test_include_empty_include_mask(mask): - # test configuration - project_code = "dummy_project_code" - flavor = "mssql" - profiling_query = CProfilingSQL(project_code, flavor) - profiling_query.parm_table_set = "" - profiling_query.parm_table_include_mask = "important%, %useful_%" - profiling_query.parm_table_exclude_mask = mask - - # test run - query, _ = profiling_query.GetDDFQuery() - print(query) - - # test assertions - assert r"""AND ( - (c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful[_]%' ) - )""" in query diff --git a/tests/unit/test_refresh_data_chars_query.py b/tests/unit/test_refresh_data_chars_query.py new file mode 100644 index 00000000..a84bc139 --- /dev/null +++ b/tests/unit/test_refresh_data_chars_query.py @@ -0,0 +1,62 @@ +import pytest + +from testgen.commands.queries.refresh_data_chars_query import RefreshDataCharsSQL +from testgen.common.models.connection import Connection +from testgen.common.models.table_group import TableGroup + + +@pytest.mark.unit +def test_include_exclude_mask_basic(): + connection = Connection(sql_flavor="postgresql") + table_group = TableGroup( + 
table_group_schema="test_schema", + profiling_table_set="", + profiling_include_mask="important%, %useful%", + profiling_exclude_mask="temp%,tmp%,raw_slot_utilization%,gps_product_step_change_log" + ) + sql_generator = RefreshDataCharsSQL(connection, table_group) + query, _ = sql_generator.get_schema_ddf() + + assert "WHERE c.table_schema = 'test_schema'" in query + assert r"""AND ( + (c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful%' ) + )""" in query + assert r"""AND NOT ( + (c.table_name LIKE 'temp%' ) OR (c.table_name LIKE 'tmp%' ) OR (c.table_name LIKE 'raw\_slot\_utilization%' ) OR (c.table_name LIKE 'gps\_product\_step\_change\_log' ) + )""" in query + + +@pytest.mark.unit +@pytest.mark.parametrize("mask", ("", None)) +def test_include_empty_exclude_mask(mask): + connection = Connection(sql_flavor="snowflake") + table_group = TableGroup( + table_group_schema="test_schema", + profiling_table_set="", + profiling_include_mask=mask, + profiling_exclude_mask="temp%,tmp%,raw_slot_utilization%,gps_product_step_change_log" + ) + sql_generator = RefreshDataCharsSQL(connection, table_group) + query, _ = sql_generator.get_schema_ddf() + + assert r"""AND NOT ( + (c.table_name LIKE 'temp%' ESCAPE '\\') OR (c.table_name LIKE 'tmp%' ESCAPE '\\') OR (c.table_name LIKE 'raw\\_slot\\_utilization%' ESCAPE '\\') OR (c.table_name LIKE 'gps\\_product\\_step\\_change\\_log' ESCAPE '\\') + )""" in query + + +@pytest.mark.unit +@pytest.mark.parametrize("mask", ("", None)) +def test_include_empty_include_mask(mask): + connection = Connection(sql_flavor="mssql") + table_group = TableGroup( + table_group_schema="test_schema", + profiling_table_set="", + profiling_include_mask="important%, %useful_%", + profiling_exclude_mask=mask, + ) + sql_generator = RefreshDataCharsSQL(connection, table_group) + query, _ = sql_generator.get_schema_ddf() + + assert r"""AND ( + (c.table_name LIKE 'important%' ) OR (c.table_name LIKE '%useful[_]%' ) + )""" in query
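As a minimal sketch of how the new preview path can be exercised outside the UI (not part of this patch; it reuses only the constructor arguments shown in the unit tests above, and the schema and mask values are placeholders), the RefreshDataCharsSQL generator can be instantiated directly to inspect the DDF query that a table group's include/exclude masks produce:

# Minimal sketch (not part of the patch): build the same generator the table group
# preview uses and print the schema DDF query it would run against the target database.
# The schema name and mask values below are placeholders, not values from the patch.
from testgen.commands.queries.refresh_data_chars_query import RefreshDataCharsSQL
from testgen.common.models.connection import Connection
from testgen.common.models.table_group import TableGroup

connection = Connection(sql_flavor="postgresql")
table_group = TableGroup(
    table_group_schema="analytics",
    profiling_table_set="",
    profiling_include_mask="fact_%, dim_%",
    profiling_exclude_mask="tmp%",
)

sql_generator = RefreshDataCharsSQL(connection, table_group)
query, _ = sql_generator.get_schema_ddf()
print(query)  # shows the WHERE clause built from the schema and the include/exclude masks

The printed query is the same one that _get_preview passes to fetch_from_target_db before wrapping each returned row in ColumnChars to build the preview stats.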