diff --git a/src/intugle/adapters/types/bigquery/bigquery.py b/src/intugle/adapters/types/bigquery/bigquery.py index f121a05..8fd38e0 100644 --- a/src/intugle/adapters/types/bigquery/bigquery.py +++ b/src/intugle/adapters/types/bigquery/bigquery.py @@ -1,4 +1,5 @@ import time + from typing import TYPE_CHECKING, Any, Optional import numpy as np @@ -18,11 +19,17 @@ try: from google.cloud import bigquery from google.oauth2 import service_account - from sqlglot import transpile + GOOGLE_BIGQUERY_AVAILABLE = True +except ImportError: + GOOGLE_BIGQUERY_AVAILABLE = False - BIGQUERY_AVAILABLE = True +try: + from sqlglot import exp, transpile + SQLGLOT_AVAILABLE = True except ImportError: - BIGQUERY_AVAILABLE = False + SQLGLOT_AVAILABLE = False + +BIGQUERY_AVAILABLE = GOOGLE_BIGQUERY_AVAILABLE and SQLGLOT_AVAILABLE class BigQueryAdapter(Adapter): @@ -98,15 +105,8 @@ def connect(self): def _get_fqn(self, identifier: str) -> str: """Gets the fully qualified name for a table identifier.""" if "." in identifier: - # Already has project or dataset prefix - parts = identifier.split(".") - if len(parts) == 2: - # dataset.table format - return f"`{self._project_id}.{identifier}`" - elif len(parts) == 3: - # project.dataset.table format - return f"`{identifier}`" - return f"`{self._project_id}.{self._dataset_id}.{identifier}`" + return exp.to_table(identifier).sql(dialect="bigquery") + return exp.to_table(identifier, db=self._dataset_id, catalog=self._project_id).sql(dialect="bigquery") @staticmethod def check_data(data: Any) -> BigQueryConfig: diff --git a/src/intugle/adapters/types/databricks/databricks.py b/src/intugle/adapters/types/databricks/databricks.py index 23e48b9..be7d4e6 100644 --- a/src/intugle/adapters/types/databricks/databricks.py +++ b/src/intugle/adapters/types/databricks/databricks.py @@ -36,7 +36,7 @@ DATABRICKS_SQL_AVAILABLE = False try: - from sqlglot import transpile + from sqlglot import exp, transpile SQLGLOT_AVAILABLE = True except ImportError: 
SQLGLOT_AVAILABLE = False @@ -136,19 +136,10 @@ def connect(self): def _get_fqn(self, identifier: str) -> str: """Gets the fully qualified name for a table identifier.""" - # An identifier is already fully qualified if it contains a dot. if "." in identifier: - return identifier + return exp.to_table(identifier).sql(dialect="databricks") - # Backticks are used to handle reserved keywords and special characters. - safe_schema = f"`{self._schema}`" - safe_identifier = f"`{identifier}`" - - if self.catalog: - safe_catalog = f"`{self.catalog}`" - return f"{safe_catalog}.{safe_schema}.{safe_identifier}" - - return f"{safe_schema}.{safe_identifier}" + return exp.to_table(identifier, db=self._schema, catalog=self.catalog).sql(dialect="databricks") @staticmethod def check_data(data: Any) -> DatabricksConfig: diff --git a/src/intugle/adapters/types/mariadb/mariadb.py b/src/intugle/adapters/types/mariadb/mariadb.py index b948855..178bee1 100644 --- a/src/intugle/adapters/types/mariadb/mariadb.py +++ b/src/intugle/adapters/types/mariadb/mariadb.py @@ -24,7 +24,7 @@ MARIADB_CONNECTOR_AVAILABLE = False try: - from sqlglot import transpile + from sqlglot import exp, transpile SQLGLOT_AVAILABLE = True except Exception: @@ -112,9 +112,8 @@ def _quote_id(identifier: str) -> str: def _get_fqn(self, identifier: str) -> str: if "." 
in identifier: - parts = identifier.split(".") - return ".".join(self._quote_id(p) for p in parts) - return f"{self._quote_id(self.database)}.{self._quote_id(identifier)}" + return exp.to_table(identifier).sql(dialect="mysql") # MariaDB uses MySQL dialect in sqlglot + return exp.to_table(identifier, db=self._database).sql(dialect="mysql") @staticmethod def check_data(data: Any) -> MariaDBConfig: diff --git a/src/intugle/adapters/types/oracle/oracle.py b/src/intugle/adapters/types/oracle/oracle.py index d3e9ce1..20ba94e 100644 --- a/src/intugle/adapters/types/oracle/oracle.py +++ b/src/intugle/adapters/types/oracle/oracle.py @@ -24,7 +24,7 @@ ORACLE_AVAILABLE = False try: - from sqlglot import transpile + from sqlglot import exp, transpile SQLGLOT_AVAILABLE = True except ImportError: @@ -103,9 +103,10 @@ def connect(self): cursor.execute(f"ALTER SESSION SET CURRENT_SCHEMA = {params.schema_}") def _get_fqn(self, identifier: str) -> str: + """Gets the fully qualified name for a table identifier.""" if "." in identifier: - return identifier.upper() # Oracle identifiers are case-insensitive/upper by default unless quoted - return f'"{self._schema}"."{identifier}"' + return exp.to_table(identifier).sql(dialect="oracle") + return exp.to_table(identifier, db=self._schema).sql(dialect="oracle") @staticmethod def check_data(data: Any) -> OracleConfig: diff --git a/src/intugle/adapters/types/postgres/postgres.py b/src/intugle/adapters/types/postgres/postgres.py index 57ee0eb..71f2dad 100644 --- a/src/intugle/adapters/types/postgres/postgres.py +++ b/src/intugle/adapters/types/postgres/postgres.py @@ -26,7 +26,7 @@ ASYNC_PG_AVAILABLE = False try: - from sqlglot import transpile + from sqlglot import exp, transpile SQLGLOT_AVAILABLE = True except ImportError: @@ -150,8 +150,8 @@ async def _connect_async(self): def _get_fqn(self, identifier: str) -> str: """Gets the fully qualified name for a table identifier.""" if "." 
in identifier: - return identifier - return f'"{self._schema}"."{identifier}"' + return exp.to_table(identifier).sql(dialect="postgres") + return exp.to_table(identifier, db=self._schema).sql(dialect="postgres") @staticmethod def check_data(data: Any) -> PostgresConfig: diff --git a/src/intugle/adapters/types/snowflake/snowflake.py b/src/intugle/adapters/types/snowflake/snowflake.py index e01c43e..495495d 100644 --- a/src/intugle/adapters/types/snowflake/snowflake.py +++ b/src/intugle/adapters/types/snowflake/snowflake.py @@ -253,7 +253,7 @@ def _sync_metadata(self, manifest: "Manifest"): # Apply comments and tags to tables and columns for source in manifest.sources.values(): # Construct the fully qualified table name using details from profiles.yml - full_table_name = f"{database}.{schema}.{source.table.name}" + full_table_name = f"{quote_identifier(database)}.{quote_identifier(schema)}.{quote_identifier(source.table.name)}" # Set table comment if source.table.description: diff --git a/src/intugle/adapters/types/sqlserver/sqlserver.py b/src/intugle/adapters/types/sqlserver/sqlserver.py index 71816a9..f0d47b1 100644 --- a/src/intugle/adapters/types/sqlserver/sqlserver.py +++ b/src/intugle/adapters/types/sqlserver/sqlserver.py @@ -27,7 +27,7 @@ MSSQL_PYTHON_AVAILABLE = False try: - from sqlglot import transpile + from sqlglot import exp, transpile SQLGLOT_AVAILABLE = True except ImportError: @@ -112,8 +112,8 @@ def connect(self): def _get_fqn(self, identifier: str) -> str: """Gets the fully qualified name for a table identifier.""" if "." 
in identifier: - return identifier - return f'[{self._schema}].[{identifier}]' + return exp.to_table(identifier).sql(dialect="tsql") + return exp.to_table(identifier, db=self._schema).sql(dialect="tsql") @staticmethod def check_data(data: Any) -> SQLServerConfig: diff --git a/src/intugle/data_product.py b/src/intugle/data_product.py index 71f136b..69abdfc 100644 --- a/src/intugle/data_product.py +++ b/src/intugle/data_product.py @@ -186,7 +186,7 @@ def get_all_field_details(self) -> dict[str, FieldDetailsModel]: datatype_l1=column.type, datatype_l2=column.category, sql_code=f"\"{source.table.name}\".\"{column.name}\"", - is_pii=False, + is_pii="pii" in [t.lower() for t in (column.tags or [])], asset_id=source.table.name, asset_name=source.table.name, asset_details={}, @@ -235,7 +235,7 @@ def field_details_fetcher(ids: list[str]): datatype_l1=column_detail.type, datatype_l2=column_detail.category, sql_code=f"\"{table}\".\"{column}\"", - is_pii=False, + is_pii="pii" in [t.lower() for t in (column_detail.tags or [])], asset_id=table, asset_name=table, asset_details={}, diff --git a/src/intugle/mcp/docs_search/service.py b/src/intugle/mcp/docs_search/service.py index 134185c..db6bb94 100644 --- a/src/intugle/mcp/docs_search/service.py +++ b/src/intugle/mcp/docs_search/service.py @@ -1,7 +1,7 @@ import asyncio import os -from typing import Any, Dict, List, Optional +from typing import List, Optional import aiohttp diff --git a/src/intugle/semantic_model.py b/src/intugle/semantic_model.py index 802edf6..e9bfe87 100644 --- a/src/intugle/semantic_model.py +++ b/src/intugle/semantic_model.py @@ -1,5 +1,6 @@ import logging +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List import pandas as pd @@ -11,7 +12,6 @@ from intugle.link_predictor.predictor import LinkPredictor from intugle.semantic_search import SemanticSearch from intugle.utils.files import update_relationship_file_mtime -from pathlib import Path if TYPE_CHECKING: from 
intugle.adapters.adapter import Adapter @@ -83,6 +83,7 @@ def _initialize_from_list(self, data_list: List[DataSet]): "DataSet objects provided in a list must have a 'name' attribute." ) self.datasets[dataset.name] = dataset + def _initialize_from_folder(self, folder_path: str): """ Initialize datasets by scanning a folder (recursively) for supported data files. @@ -133,7 +134,6 @@ def _initialize_from_folder(self, folder_path: str): f"No supported data files (.csv, .parquet, .xlsx) found in directory: {folder_path}" ) - def profile(self, force_recreate: bool = False): """ Runs the data profiling pipeline for all contained datasets. diff --git a/tests/adapters/test_bigquery_adapter.py b/tests/adapters/test_bigquery_adapter.py index b416620..6ffda82 100644 --- a/tests/adapters/test_bigquery_adapter.py +++ b/tests/adapters/test_bigquery_adapter.py @@ -1,9 +1,11 @@ +from unittest.mock import MagicMock, patch + import pytest -from unittest.mock import MagicMock, patch, PropertyMock + from intugle.adapters.types.bigquery.bigquery import ( + BIGQUERY_AVAILABLE, BigQueryAdapter, can_handle_bigquery, - BIGQUERY_AVAILABLE, ) from intugle.adapters.types.bigquery.models import BigQueryConfig, BigQueryConnectionConfig diff --git a/tests/adapters/test_utils.py b/tests/adapters/test_utils.py index 567a96a..e19a27b 100644 --- a/tests/adapters/test_utils.py +++ b/tests/adapters/test_utils.py @@ -1,7 +1,9 @@ import numpy as np -from intugle.adapters.utils import convert_to_native import pytest +from intugle.adapters.utils import convert_to_native + + def test_numpy_scalar_int(): value = np.int64(10) result = convert_to_native(value) diff --git a/tests/security/test_adapter_security.py b/tests/security/test_adapter_security.py new file mode 100644 index 0000000..c5c1214 --- /dev/null +++ b/tests/security/test_adapter_security.py @@ -0,0 +1,160 @@ + +from unittest.mock import MagicMock, patch + +from intugle.adapters.types.bigquery.bigquery import BigQueryAdapter +from 
intugle.adapters.types.databricks.databricks import DatabricksAdapter +from intugle.adapters.types.mariadb.mariadb import MariaDBAdapter +from intugle.adapters.types.oracle.oracle import OracleAdapter +from intugle.adapters.types.postgres.postgres import PostgresAdapter +from intugle.adapters.types.sqlserver.sqlserver import SQLServerAdapter +from intugle.data_product import DataProduct + + +# --- PII Logic Tests --- +def test_data_product_pii_logic(): + """Verify that is_pii is correctly derived from tags.""" + # Mock Manifest and Source + mock_column_pii = MagicMock() + mock_column_pii.name = "email" + mock_column_pii.type = "string" + mock_column_pii.category = "dimension" + mock_column_pii.tags = ["PII", "sensitive"] + + mock_column_normal = MagicMock() + mock_column_normal.name = "age" + mock_column_normal.type = "integer" + mock_column_normal.category = "measure" + mock_column_normal.tags = ["demographic"] + + mock_table = MagicMock() + mock_table.name = "users" + mock_table.details = {} + mock_table.columns = [mock_column_pii, mock_column_normal] + + mock_source = MagicMock() + mock_source.schema_ = "public" + mock_source.table = mock_table + + # Mock DataProduct to load our mock manifest + with patch("intugle.data_product.ManifestLoader") as MockLoader: + mock_loader_instance = MockLoader.return_value + mock_loader_instance.manifest.sources = {"users": mock_source} + mock_loader_instance.manifest.relationships = {} + + # Patch DataSet to avoid initialization errors with empty details + with patch("intugle.data_product.DataSet"): + dp = DataProduct() + + # Check PII field + pii_field_id = "users.email" + assert pii_field_id in dp.field_details + assert dp.field_details[pii_field_id].is_pii is True, "email should be PII" + + # Check Normal field + normal_field_id = "users.age" + assert normal_field_id in dp.field_details + assert dp.field_details[normal_field_id].is_pii is False, "age should not be PII" + +# --- Adapter SQL Injection Tests --- + + +def 
test_postgres_adapter_fqn_safety(): + """Test PostgresAdapter _get_fqn with malicious input.""" + # We don't need a real connection for _get_fqn usually, but let's see implementation. + # PostgresAdapter requires settings and async runner in init. We mock them. + with patch("intugle.adapters.types.postgres.postgres.POSTGRES_AVAILABLE", True): + # Bypass init connection + with patch.object(PostgresAdapter, "connect"): + with patch("intugle.adapters.types.postgres.postgres.AsyncRunner"): + adapter = PostgresAdapter() + adapter._schema = "public" + + malicious_table = 'users"; DROP TABLE accounts; --' + try: + fqn = adapter._get_fqn(malicious_table) + + # Expected safe: "public"."users""; DROP TABLE accounts; --" + # sqlglot quotes identifiers with double quotes and escapes existing double quotes with another double quote. + print(f"Postgres FQN: {fqn}") + + # Verify that the quote is escaped (doubled) + assert '""' in fqn or '\\"' in fqn + except Exception as e: + # sqlglot might raise TokenError or ParseError on malformed/malicious input. + # This effectively prevents injection. 
+ print(f"Caught expected error preventing injection: {e}") + pass + + +def test_databricks_adapter_fqn_safety(): + with patch("intugle.adapters.types.databricks.databricks.DATABRICKS_AVAILABLE", True): + with patch.object(DatabricksAdapter, "connect"): + adapter = DatabricksAdapter() + adapter._schema = "default" + adapter.catalog = "hive_metastore" + + malicious_table = 'users`; DROP TABLE accounts; --' + fqn = adapter._get_fqn(malicious_table) + print(f"Databricks FQN: {fqn}") + + assert '``;' in fqn + + +def test_sqlserver_adapter_fqn_safety(): + with patch("intugle.adapters.types.sqlserver.sqlserver.SQLSERVER_AVAILABLE", True): + with patch.object(SQLServerAdapter, "connect"): + adapter = SQLServerAdapter() + adapter._schema = "dbo" + + malicious_table = 'users]; DROP TABLE accounts; --' + fqn = adapter._get_fqn(malicious_table) + print(f"SQLServer FQN: {fqn}") + + # Expected: [dbo].[users]]; DROP TABLE accounts; --] + assert ']];' in fqn + + +def test_oracle_adapter_fqn_safety(): + with patch("intugle.adapters.types.oracle.oracle.ORACLE_AVAILABLE", True): + with patch.object(OracleAdapter, "connect"): + adapter = OracleAdapter() + adapter._schema = "HR" + + malicious_table = 'users"; DROP TABLE accounts; --' + # Oracle uses sqlglot now, so expect similar behavior to Postgres (safe quoting or error) + try: + fqn = adapter._get_fqn(malicious_table) + print(f"Oracle FQN: {fqn}") + assert '""' in fqn or '\\"' in fqn + except Exception as e: + print(f"Caught expected error preventing injection: {e}") + pass + + +def test_mariadb_adapter_fqn_safety(): + with patch("intugle.adapters.types.mariadb.mariadb.MARIADB_AVAILABLE", True): + with patch.object(MariaDBAdapter, "connect"): + adapter = MariaDBAdapter() + adapter._database = "inventory" # MariaDB uses database as schema + + malicious_table = 'users`; DROP TABLE accounts; --' + fqn = adapter._get_fqn(malicious_table) + print(f"MariaDB FQN: {fqn}") + + # Should look like `inventory`.`users`` ...` + assert 
'``;' in fqn + + +def test_bigquery_adapter_fqn_safety(): + with patch("intugle.adapters.types.bigquery.bigquery.BIGQUERY_AVAILABLE", True): + with patch.object(BigQueryAdapter, "connect"): + adapter = BigQueryAdapter() + adapter._project_id = "my-project" + adapter._dataset_id = "analytics" + + malicious_table = 'users`; DROP TABLE accounts; --' + fqn = adapter._get_fqn(malicious_table) + print(f"BigQuery FQN: {fqn}") + + # Should look like `my-project`.`analytics`.`users`` ...` (escaped backtick) + assert '``;' in fqn