diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index bf27e03f47..6c4dfc9144 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -304,6 +304,7 @@ workflows: - spark - clickhouse - risingwave + - starrocks - engine_tests_cloud: name: cloud_engine_<< matrix.engine >> context: diff --git a/.circleci/wait-for-db.sh b/.circleci/wait-for-db.sh index a313320279..5ab06dd832 100755 --- a/.circleci/wait-for-db.sh +++ b/.circleci/wait-for-db.sh @@ -50,6 +50,34 @@ spark_ready() { probe_port 15002 } +starrocks_ready() { + probe_port 9030 + + echo "Checking for 1 alive StarRocks backends..." + sleep 5 + + while true; do + echo "Checking StarRocks backends..." + ALIVE_BACKENDS=$(docker exec -i starrocks-fe mysql -h127.0.0.1 -P9030 -uroot -e "show backends \G" | grep -c "^ *Alive: true *$") + + # fallback value if failed to get number + if ! [[ "$ALIVE_BACKENDS" =~ ^[0-9]+$ ]]; then + echo "WARN: Unable to parse number of alive backends, got: '$ALIVE_BACKENDS'" + ALIVE_BACKENDS=0 + fi + + echo "Found $ALIVE_BACKENDS alive backends" + + if [ "$ALIVE_BACKENDS" -ge 1 ]; then + echo "StarRocks has 1 or more alive backends" + break + fi + + echo "Waiting for more backends to become alive..." + sleep 5 + done +} + trino_ready() { # Trino has a built-in healthcheck script, just call that docker compose -f tests/core/engine_adapter/integration/docker/compose.trino.yaml exec trino /bin/bash -c '/usr/lib/trino/bin/health-check' diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 68d856c589..ee4794538f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,7 +6,7 @@ build: python: "3.10" jobs: pre_build: - - pip install -e ".[athena,azuresql,bigframes,bigquery,clickhouse,databricks,dbt,dlt,gcppostgres,github,llm,mssql,mysql,mwaa,postgres,redshift,slack,snowflake,trino,web,risingwave]" + - pip install -e ".[athena,azuresql,bigframes,bigquery,clickhouse,databricks,dbt,dlt,gcppostgres,github,llm,mssql,mysql,mwaa,postgres,redshift,slack,snowflake,starrocks,trino,web,risingwave]" - make api-docs mkdocs: diff --git a/Makefile b/Makefile index 611b179eba..828328c119 100644 --- a/Makefile +++ b/Makefile @@ -208,6 +208,9 @@ trino-test: engine-trino-up risingwave-test: engine-risingwave-up pytest -n auto -m "risingwave" --reruns 3 --junitxml=test-results/junit-risingwave.xml +starrocks-test: engine-starrocks-up + pytest -n auto -m "starrocks" --reruns 3 --junitxml=test-results/junit-starrocks.xml + ################# # Cloud Engines # ################# diff --git a/docs/guides/configuration.md b/docs/guides/configuration.md index d6d4f20c11..a8f1183280 100644 --- a/docs/guides/configuration.md +++ b/docs/guides/configuration.md @@ -920,6 +920,7 @@ These pages describe the connection configuration options for each execution eng * [GCP Postgres](../integrations/engines/gcp-postgres.md) * [Redshift](../integrations/engines/redshift.md) * [Snowflake](../integrations/engines/snowflake.md) +* [StarRocks](../integrations/engines/starrocks.md) * [Spark](../integrations/engines/spark.md) * [Trino](../integrations/engines/trino.md) @@ -952,6 +953,7 @@ Unsupported state engines, even for development: * [ClickHouse](../integrations/engines/clickhouse.md) * [Spark](../integrations/engines/spark.md) +* [StarRocks](../integrations/engines/starrocks.md) * [Trino](../integrations/engines/trino.md) This example gateway configuration uses Snowflake for the data warehouse connection and Postgres for the state backend connection: diff --git a/docs/guides/connections.md 
b/docs/guides/connections.md index e0dca0f7a4..bc763f3f5a 100644 --- a/docs/guides/connections.md +++ b/docs/guides/connections.md @@ -90,4 +90,5 @@ default_gateway: local_db * [Redshift](../integrations/engines/redshift.md) * [Snowflake](../integrations/engines/snowflake.md) * [Spark](../integrations/engines/spark.md) +* [StarRocks](../integrations/engines/starrocks.md) * [Trino](../integrations/engines/trino.md) diff --git a/docs/integrations/engines/starrocks.md b/docs/integrations/engines/starrocks.md new file mode 100644 index 0000000000..cc7ab8a2e9 --- /dev/null +++ b/docs/integrations/engines/starrocks.md @@ -0,0 +1,524 @@ +# StarRocks + +## Overview + +[StarRocks](https://www.starrocks.io/) is a next-generation sub-second MPP OLAP database designed for real-time analytics. It provides high concurrency, low latency, and supports both batch and stream processing. + +SQLMesh supports StarRocks through its MySQL-compatible protocol, providing StarRocks-specific optimizations for table models, indexing, partitioning, and more. The adapter leverages StarRocks's strengths for analytical workloads with sensible defaults and advanced configuration support. + +## Prerequisites + +* Install SQLMesh with the StarRocks extra: + +```bash +pip install "sqlmesh[starrocks]" +``` + +* Initialize a SQLMesh project (if you haven't already): + +```bash +sqlmesh init +``` + +* Configure a separate state backend: + * StarRocks is currently **not supported** as a SQLMesh `state_connection`. + * Use DuckDB (recommended) or another engine for SQLMesh state. + +## Connection Configuration Example + +```yaml linenums="1" hl_lines="2 4-8 13-15" +gateways: + starrocks: + connection: + type: starrocks + host: starrocks-fe # Frontend (FE) node address + port: 9030 # Query port (default: 9030) + user: starrocks_user + password: your_password + database: your_database + # Optional MySQL-compatible settings + # charset: utf8mb4 + # connect_timeout: 60 + state_connection: + type: duckdb + database: ./state/sqlmesh_state.db + +default_gateway: starrocks + +model_defaults: + dialect: starrocks +``` + +### StarRocks setup note (optional) + +If you're running a shared-nothing cluster with a single backend, you may need to adjust the default replication number: + +```sql +ADMIN SET frontend config ("default_replication_num" = "1"); +``` + +## Quickstart + +### 1) A minimal table (DUPLICATE KEY default) + +```sql +MODEL ( + name user_events, + kind FULL, + physical_properties ( + distributed_by = RANDOM + ) +); + +SELECT + user_id, + event_time, + event_type +FROM source.user_events; +``` + +A `DUPLICATE KEY` table can usually be used as a `FULL` kind model. + +### 2) An incremental table (PRIMARY KEY recommended) + +```sql +MODEL ( + name user_events_inc, + kind INCREMENTAL_BY_TIME_RANGE( + time_column event_date + ), + physical_properties ( + primary_key = (user_id, event_date), + partition_by = (date_trunc('day', event_date)), + distributed_by = (kind=HASH, expressions=user_id, buckets=16) + ) +); + +SELECT + user_id, + event_date, + COUNT(*) AS cnt +FROM source.user_events +WHERE event_date BETWEEN @start_ds AND @end_ds +GROUP BY user_id, event_date; +``` + +## Table Types + +StarRocks supports four table types: **DUPLICATE KEY**, **PRIMARY KEY**, **UNIQUE KEY**, and **AGGREGATE KEY**. + +SQLMesh configures StarRocks table types via `physical_properties` (engine-specific table properties). 
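+
+As a rough illustration, the quickstart incremental model above (with `primary_key`, `partition_by`, and `distributed_by` in `physical_properties`) corresponds to StarRocks DDL along these lines. The column data types here are assumed for illustration, and the exact SQL emitted by SQLMesh may differ:
+
+```sql
+-- Approximate StarRocks DDL for the quickstart incremental model (illustrative only)
+CREATE TABLE user_events_inc (
+  user_id BIGINT,
+  event_date DATE,
+  cnt BIGINT
+)
+PRIMARY KEY (user_id, event_date)
+PARTITION BY date_trunc('day', event_date)
+DISTRIBUTED BY HASH (user_id) BUCKETS 16;
+```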
+
+> **Note**: StarRocks `AGGREGATE KEY` requires per-value-column aggregation functions, which SQLMesh model syntax **DOES NOT** currently support. Use `PRIMARY KEY` or `DUPLICATE KEY` instead.
+
+### DUPLICATE KEY Type (Default)
+
+If you do not set a key type, StarRocks creates a DUPLICATE KEY table by default.
+
+**Example:**
+
+```sql
+MODEL (
+  name user_events,
+  kind FULL,
+  physical_properties (
+    distributed_by = RANDOM
+  )
+);
+```
+
+### PRIMARY KEY Type
+
+For incremental models, **PRIMARY KEY tables are effectively required** for robust deletes, because StarRocks only supports a *weaker* form of `DELETE ... WHERE ...` on non-primary-key table types.
+
+SQLMesh will apply conservative `WHERE` transformations for compatibility (for example, converting `BETWEEN` to `>= AND <=`, removing boolean literals, and converting `DELETE ... WHERE TRUE` to `TRUNCATE TABLE`). To avoid these limitations and keep incremental maintenance reliable, use a `PRIMARY KEY` table by setting `physical_properties.primary_key`.
+
+> SQLMesh currently does not support specifying `primary_key` as a model parameter.
+
+**Example (INCREMENTAL_BY_TIME_RANGE):**
+
+```sql
+MODEL (
+  name user_events,
+  kind INCREMENTAL_BY_TIME_RANGE(
+    time_column event_date
+  ),
+  physical_properties (
+    primary_key = (user_id, event_date),
+    distributed_by = (kind=HASH, expressions=user_id, buckets=16)
+  )
+);
+
+SELECT
+  user_id,
+  event_date,
+  COUNT(*) AS cnt
+FROM source.user_events
+WHERE event_date BETWEEN @start_ds AND @end_ds
+GROUP BY user_id, event_date;
+```
+
+### UNIQUE KEY Type
+
+You can create a UNIQUE KEY table by setting `physical_properties.unique_key`. In most incremental use cases, a PRIMARY KEY table is recommended instead.
+
+**Example:**
+
+```sql
+MODEL (
+  name user_events_unique,
+  kind FULL,
+  physical_properties (
+    unique_key = (user_id, event_date),
+    distributed_by = (kind=HASH, expressions=user_id, buckets=16)
+  )
+);
+```
+
+## Table Properties
+
+This section documents StarRocks engine-specific table properties set via `physical_properties (...)`. Most properties support:
+
+* **Structured form** (recommended): easier validation and clearer intent
+* **String fallback**: for convenience or when you want to paste native StarRocks syntax quickly
+
+Most of the time, the value syntax is the same as (or similar to) the corresponding clause in StarRocks, especially for **string** values.
+
+When specifying **string** values, prefer **single quotes**.
+
+### Configuration Matrix
+
+| Property | Where | Recommended form | String fallback | Notes |
+| --- | --- | --- | --- | --- |
+| `primary_key` | `physical_properties` | `primary_key = (col1, col2)` | `primary_key = 'col1, col2'` | Required for PRIMARY KEY tables (recommended for incremental). |
+| `duplicate_key` | `physical_properties` | `duplicate_key = (col1, col2)` | `duplicate_key = 'col1, col2'` | Explicitly sets DUPLICATE KEY table type. |
+| `unique_key` | `physical_properties` | `unique_key = (col1, col2)` | `unique_key = 'col1, col2'` | Sets UNIQUE KEY table type. |
+| `partitioned_by` / `partition_by` | `MODEL` / `physical_properties` | `partitioned_by (dt)` (model param) / `partition_by = RANGE(dt, region)` (table property) | `partition_by = 'RANGE(dt, region)'` | It's recommended to use `partition_by` in `physical_properties` for RANGE/LIST partitioning, together with `partitions`. |
+| `partitions` | `physical_properties` | `partitions = ('PARTITION ...', 'PARTITION ...')` | `partitions = 'PARTITION ...'` | Initial partitions; easiest to express as strings. When using RANGE or LIST partitioning, you need to specify initial `partitions`. |
+| `distributed_by` | `physical_properties` | `distributed_by = (kind=HASH, expressions=(c1, c2), buckets=10)` | `distributed_by = 'HASH(c1, c2) BUCKETS 10'` / `distributed_by = 'RANDOM'` | |
+| `clustered_by` / `order_by` | `MODEL` / `physical_properties` | `clustered_by (col1, col2)` / `order_by = (col1, col2)` | `order_by = 'col1, col2'` | Ordering/clustering columns for query performance when they differ from the table key. |
+| Other properties | `physical_properties` | Use strings (recommended) | Use strings | StarRocks `PROPERTIES` are string key/value pairs. |
+
+**Notes:**
+
+* You can use enum-like values without quotes (for example `HASH`, `RANDOM`, `IMMEDIATE`), but strings are also accepted (prefer single quotes).
+* Aliases exist for convenience: use `partition_by` (table property) as an alias of `partitioned_by` (model parameter), and `order_by` ↔ `clustered_by`.
+* Only a few properties (`partitioned_by`, `clustered_by`) can also be set as model parameters. For simplicity, it's recommended to use table properties only.
+
+### Table Key Properties
+
+Table key properties accept multiple forms:
+
+* **Structured**: `col` or `(col1, col2, ...)`
+* **String**: `'col'` or `'col1, col2'`
+
+**Syntax:**
+
+* Structured: `primary_key = col`, `primary_key = (col1, col2)`, `duplicate_key = (col2)`
+* String: `primary_key = 'col1, col2'`, `unique_key = '(col2, col3)'`
+
+#### PRIMARY KEY
+
+```sql
+MODEL (
+  name my_pk_table,
+  kind FULL,
+  physical_properties (
+    primary_key = (id, ds),
+    distributed_by = (kind=HASH, expressions=id, buckets=10)
+  )
+);
+```
+
+#### DUPLICATE KEY
+
+```sql
+MODEL (
+  name my_dup_table,
+  kind FULL,
+  physical_properties (
+    duplicate_key = (id, ds),
+    distributed_by = RANDOM
+  )
+);
+```
+
+#### UNIQUE KEY
+
+```sql
+MODEL (
+  name my_unique_table,
+  kind FULL,
+  physical_properties (
+    unique_key = (id, ds),
+    distributed_by = (kind=HASH, expressions=id, buckets=10)
+  )
+);
+```
+
+### Partitioning
+
+StarRocks supports `RANGE` partitioning, `LIST` partitioning, and **expression partitioning**.
+
+You can specify partitioning either:
+
+* As a **model parameter**: `partitioned_by (...)` (good for simple expressions)
+* As a **table property**: `physical_properties(partition_by=...)` (recommended when you need RANGE/LIST or complex expressions)
+
+For `RANGE` and `LIST` partitioning, you generally need to provide initial `partitions` (pre-created partitions). For expression partitioning, `partitions` is usually not needed.
+
+#### `partitioned_by` / `partition_by`
+
+NOTE:
+
+* `partitioned_by (...)` can only be used as a model parameter (SQLMesh enforces this constraint).
+* `partition_by` can be provided in `physical_properties` as a table property (for advanced partitioning).
+
+**Syntax:**
+
+* Expression list: `partitioned_by (col)` / `partitioned_by (expr1, expr2)`
+    * A more complex example: `partition_by = (date_trunc('day', col2), col3)`
+* RANGE/LIST: `partition_by = RANGE(col1, col2)` / `partition_by = LIST(col1, col2)`
+* String fallback: `partition_by = 'RANGE(col1, col2)'`
+
+#### `partitions`
+
+**Syntax:**
+
+* Tuple of strings: `partitions = ('PARTITION ...', 'PARTITION ...')`
+* Single string: `partitions = 'PARTITION ...'`
+
+#### Expression partitioning
+
+```sql
+MODEL (
+  name my_partitioned_model,
+  kind INCREMENTAL_BY_TIME_RANGE(time_column event_date),
+  partitioned_by (date_trunc('day', event_time), region),
+  physical_properties (
+    primary_key = (user_id, event_date, region),
+    distributed_by = (kind=HASH, expressions=user_id, buckets=10)
+  )
+);
+```
+
+#### RANGE partitioning
+
+```sql
+MODEL (
+  name my_partitioned_model_advanced,
+  kind FULL,
+  physical_properties (
+    partition_by = RANGE(event_time),
+    partitions = (
+      'PARTITION p20240101 VALUES [("2024-01-01"), ("2024-01-02"))',
+      'PARTITION p20240102 VALUES [("2024-01-02"), ("2024-01-03"))'
+    ),
+    distributed_by = (kind=HASH, expressions=region, buckets=10)
+  )
+);
+```
+
+`LIST` partitioning works the same way as `RANGE` partitioning.
+
+### Distribution
+
+StarRocks supports both `HASH` and `RANDOM` distribution. You can use a structured value or a string.
+
+1. Structured syntax: `(kind=<HASH|RANDOM> [, expressions=<columns>] [, buckets=<number>])`
+
+    * **kind**: `HASH` or `RANDOM`.
+    * **expressions**: a single column or a tuple of columns, such as `col1` or `(col1, col2)`. (optional)
+    * **buckets**: the number of buckets. (optional)
+
+2. A string such as `'HASH(id) BUCKETS 10'`, matching the distribution clause in StarRocks's `CREATE TABLE`.
+3. A single enum-like value: `distributed_by = RANDOM`.
+
+#### HASH distribution
+
+Structured form (recommended):
+
+```sql
+MODEL (
+  name my_table,
+  kind FULL,
+  physical_properties (
+    distributed_by = (kind=HASH, expressions=(user_id), buckets=10)
+  )
+);
+```
+
+#### RANDOM distribution
+
+A bare enum value:
+
+```sql
+MODEL (
+  name my_table_random,
+  kind FULL,
+  physical_properties (
+    distributed_by = RANDOM
+  )
+);
+```
+
+#### String fallback
+
+A single string, identical to the distribution clause in StarRocks's `CREATE TABLE`:
+
+```sql
+MODEL (
+  name my_table_string_dist,
+  kind FULL,
+  physical_properties (
+    distributed_by = 'HASH(user_id) BUCKETS 10'
+  )
+);
+```
+
+### Ordering
+
+You can use `clustered_by` or `order_by` to specify an ordering that optimizes query performance when it differs from the table key.
+
+You can specify `clustered_by` both as a model parameter and as a table property, but `order_by` only as a table property.
+
+**Syntax:**
+
+* Structured: `order_by = col` / `order_by = (col1, col2)`
+* String fallback: `order_by = 'col1, col2'`
+
+```sql
+MODEL (
+  name my_ordered_table,
+  kind FULL,
+  physical_properties (
+    order_by = (ds, id),
+    distributed_by = (kind=HASH, expressions=id, buckets=10)
+  )
+);
+```
+
+### Generic PROPERTIES
+
+Any additional properties in `physical_properties` are passed through as StarRocks `PROPERTIES`. Since StarRocks `PROPERTIES` values are typically strings, using strings is recommended.
+
+```sql
+MODEL (
+  name advanced_table,
+  kind FULL,
+  physical_properties (
+    primary_key = (id),
+    distributed_by = (kind=HASH, expressions=id, buckets=8),
+    replication_num = '1',
+    storage_medium = 'SSD',
+    enable_persistent_index = 'true',
+    compression = 'LZ4'
+  )
+);
+```
+
+## Views and Materialized Views
+
+### Views
+
+StarRocks supports view `SECURITY` via `physical_properties.security`.
+
+**Syntax:**
+
+* `security = INVOKER` or `security = NONE`. (optional)
+
+```sql
+MODEL (
+  name user_summary_view,
+  kind VIEW,
+  physical_properties (
+    security = INVOKER
+  )
+);
+
+SELECT
+  user_id,
+  COUNT(*) AS event_count,
+  MAX(event_time) AS last_event_time
+FROM user_events
+GROUP BY user_id;
+```
+
+### Materialized Views (MV)
+
+SQLMesh uses `kind VIEW (materialized true)` to create materialized views.
+
+You can specify StarRocks MV refresh settings using the same `physical_properties` block.
+
+**Refresh properties:**
+
+* `refresh_moment`: `IMMEDIATE` or `DEFERRED` (optional)
+* `refresh_scheme`: `MANUAL` or `ASYNC ...` (optional)
+    * Examples: `ASYNC`, `MANUAL`, `ASYNC START ("2024-01-01 00:00:00") EVERY (INTERVAL 5 MINUTE)`
+    * The syntax of the `ASYNC ...` clause is the same as in StarRocks.
+
+```sql
+MODEL (
+  name user_summary_mv,
+  kind VIEW (
+    materialized true
+  ),
+  physical_properties (
+    refresh_moment = DEFERRED,
+    refresh_scheme = 'ASYNC START ("2024-01-01 00:00:00") EVERY (INTERVAL 5 MINUTE)'
+  )
+);
+
+SELECT
+  user_id,
+  COUNT(*) AS event_count,
+  MAX(event_time) AS last_event_time
+FROM user_events
+GROUP BY user_id;
+```
+
+**Other properties:**
+
+You can specify partitioning, distribution, ordering, and `PROPERTIES` for MVs the same way as for normal tables. Note that only properties supported for materialized views take effect; refer to the StarRocks documentation on MV creation.
+
+**Notes:**
+
+* If you create materialized views with `replace=true`, SQLMesh may drop and recreate the MV. When an MV is dropped, its data is removed and the MV must be refreshed again.
+* There are restrictions on MV partitioning; refer to the StarRocks documentation for the MV partitioning specification.
+* StarRocks MV schema supports a column list but does **not** support explicit data types in that list. Column data types come from the `AS SELECT ...` query.
+* If you create MVs from a dataframe via the Python API, provide `target_columns_to_types` (a `Dict[str, exp.DataType]`). If you don't care about exact types, you can set all columns to `VARCHAR` as a fallback:
+
+```python
+from sqlglot import exp
+
+target_columns_to_types = {
+    "col1": exp.DataType.build("VARCHAR"),
+    "col2": exp.DataType.build("VARCHAR"),
+}
+```
+
+## Limitations
+
+* **No sync MV support (currently)**: synchronous materialized views are not supported yet.
+* **No tuple IN**: StarRocks does not support `(c1, c2) IN ((v1, v2), ...)`; see the rewrite sketch after this list.
+* **No `SELECT ... FOR UPDATE`**: StarRocks is an OLAP database and does not support row locks; SQLMesh removes `FOR UPDATE` when executing SQLGlot expressions.
+* **RENAME caveat**: `ALTER TABLE db.old RENAME db.new` is not supported; the `RENAME` target cannot be qualified with a database name.
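+
+For example, a predicate that uses tuple `IN` can be expressed as a disjunction of per-column equalities. This is an illustrative manual rewrite with hypothetical values:
+
+```sql
+-- Not accepted by StarRocks:
+-- SELECT * FROM user_events WHERE (user_id, event_type) IN ((1, 'click'), (2, 'view'));
+
+-- Equivalent form that StarRocks accepts:
+SELECT *
+FROM user_events
+WHERE (user_id = 1 AND event_type = 'click')
+   OR (user_id = 2 AND event_type = 'view');
+```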
+ +## Dependencies + +To use StarRocks with SQLMesh, install the required MySQL driver: + +```bash +pip install "sqlmesh[starrocks]" +# or +pip install pymysql +``` + +## Resources + +* [StarRocks Documentation](https://docs.starrocks.io/) +* [StarRocks Table Design Guide](https://docs.starrocks.io/docs/table_design/StarRocks_table_design/) +* [StarRocks SQL Reference](https://docs.starrocks.io/docs/sql-reference/sql-statements/data-definition/CREATE_TABLE/) diff --git a/docs/integrations/overview.md b/docs/integrations/overview.md index 94b9289d21..10525fecea 100644 --- a/docs/integrations/overview.md +++ b/docs/integrations/overview.md @@ -26,4 +26,5 @@ SQLMesh supports the following execution engines for running SQLMesh projects (e * [Redshift](./engines/redshift.md) (redshift) * [Snowflake](./engines/snowflake.md) (snowflake) * [Spark](./engines/spark.md) (spark) +* [StarRocks](./engines/starrocks.md) (starrocks) * [Trino](./engines/trino.md) (trino) diff --git a/mkdocs.yml b/mkdocs.yml index 47ddca54e9..a3fca4c3d3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -93,6 +93,7 @@ nav: - integrations/engines/risingwave.md - integrations/engines/snowflake.md - integrations/engines/spark.md + - integrations/engines/starrocks.md - integrations/engines/trino.md - Resources: - comparisons.md diff --git a/pyproject.toml b/pyproject.toml index 2c140d4770..a498fd02f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,6 +124,7 @@ snowflake = [ "snowflake-connector-python[pandas,secure-local-storage]", "snowflake-snowpark-python", ] +starrocks = ["pymysql"] trino = ["trino"] web = [ "fastapi==0.115.5", @@ -271,6 +272,7 @@ markers = [ "pyspark: test for PySpark that need to run separately from the other spark tests", "trino: test for Trino (all connectors)", "risingwave: test for Risingwave", + "starrocks: test for StarRocks", # Other "set_default_connection", diff --git a/sqlmesh/core/config/__init__.py b/sqlmesh/core/config/__init__.py index 42ed82c6e6..50d2d9a5a2 100644 --- a/sqlmesh/core/config/__init__.py +++ b/sqlmesh/core/config/__init__.py @@ -22,6 +22,7 @@ RedshiftConnectionConfig as RedshiftConnectionConfig, SnowflakeConnectionConfig as SnowflakeConnectionConfig, SparkConnectionConfig as SparkConnectionConfig, + StarRocksConnectionConfig as StarRocksConnectionConfig, TrinoConnectionConfig as TrinoConnectionConfig, parse_connection_config as parse_connection_config, ) diff --git a/sqlmesh/core/config/connection.py b/sqlmesh/core/config/connection.py index 638f0c28c8..18907b76ca 100644 --- a/sqlmesh/core/config/connection.py +++ b/sqlmesh/core/config/connection.py @@ -57,6 +57,7 @@ "trino", # Nullable types are problematic "clickhouse", + "starrocks", } MOTHERDUCK_TOKEN_REGEX = re.compile(r"(\?|\&)(motherduck_token=)(\S*)") PASSWORD_REGEX = re.compile(r"(password=)(\S+)") @@ -2326,6 +2327,80 @@ def init(cursor: t.Any) -> None: return init +class StarRocksConnectionConfig(ConnectionConfig): + """Configuration for the StarRocks connection. + + StarRocks uses MySQL network protocol and is compatible with MySQL ecosystem tools, + JDBC/ODBC drivers, and various visualization tools. + + Args: + host: The hostname of the StarRocks FE (Frontend) node. + user: The StarRocks username. + password: The StarRocks password. + port: The port number of the StarRocks FE node. Default is 9030. + database: The optional database name. + charset: The optional character set. TODO: may be not supported yet. + collation: The optional collation. TODO: may be not supported yet. 
+ ssl_disabled: Whether to disable SSL connection. TODO: need to check it. + concurrent_tasks: The maximum number of tasks that can use this connection concurrently. + register_comments: Whether or not to register model comments with the SQL engine. + local_infile: Whether or not to allow local file access. + pre_ping: Whether or not to pre-ping the connection before starting a new transaction to ensure it is still alive. + """ + + host: str + user: str + password: str + port: t.Optional[int] = 9030 + database: t.Optional[str] = None + charset: t.Optional[str] = None + collation: t.Optional[str] = None + ssl_disabled: t.Optional[bool] = None + + concurrent_tasks: int = 4 + register_comments: bool = True + local_infile: bool = False + pre_ping: bool = True + + type_: t.Literal["starrocks"] = Field(alias="type", default="starrocks") + DIALECT: t.ClassVar[t.Literal["starrocks"]] = "starrocks" + DISPLAY_NAME: t.ClassVar[t.Literal["StarRocks"]] = "StarRocks" + DISPLAY_ORDER: t.ClassVar[t.Literal[18]] = 18 + + _engine_import_validator = _get_engine_import_validator("pymysql", "starrocks") + + @property + def _connection_kwargs_keys(self) -> t.Set[str]: + connection_keys = { + "host", + "user", + "password", + } + if self.port is not None: + connection_keys.add("port") + if self.database is not None: + connection_keys.add("database") + if self.charset is not None: + connection_keys.add("charset") + if self.collation is not None: + connection_keys.add("collation") + if self.ssl_disabled is not None: + connection_keys.add("ssl_disabled") + if self.local_infile is not None: + connection_keys.add("local_infile") + return connection_keys + + @property + def _engine_adapter(self) -> t.Type[EngineAdapter]: + return engine_adapter.StarRocksEngineAdapter + + @property + def _connection_factory(self) -> t.Callable: + from pymysql import connect + + return connect + + CONNECTION_CONFIG_TO_TYPE = { # Map all subclasses of ConnectionConfig to the value of their `type_` field. 
tpe.all_field_infos()["type_"].default: tpe diff --git a/sqlmesh/core/engine_adapter/__init__.py b/sqlmesh/core/engine_adapter/__init__.py index ab29885c7b..cb9db5ea77 100644 --- a/sqlmesh/core/engine_adapter/__init__.py +++ b/sqlmesh/core/engine_adapter/__init__.py @@ -16,6 +16,7 @@ from sqlmesh.core.engine_adapter.redshift import RedshiftEngineAdapter from sqlmesh.core.engine_adapter.snowflake import SnowflakeEngineAdapter from sqlmesh.core.engine_adapter.spark import SparkEngineAdapter +from sqlmesh.core.engine_adapter.starrocks import StarRocksEngineAdapter from sqlmesh.core.engine_adapter.trino import TrinoEngineAdapter from sqlmesh.core.engine_adapter.athena import AthenaEngineAdapter from sqlmesh.core.engine_adapter.risingwave import RisingwaveEngineAdapter @@ -37,6 +38,7 @@ "athena": AthenaEngineAdapter, "risingwave": RisingwaveEngineAdapter, "fabric": FabricEngineAdapter, + "starrocks": StarRocksEngineAdapter, } DIALECT_ALIASES = { diff --git a/sqlmesh/core/engine_adapter/starrocks.py b/sqlmesh/core/engine_adapter/starrocks.py new file mode 100644 index 0000000000..18a0d8a4e5 --- /dev/null +++ b/sqlmesh/core/engine_adapter/starrocks.py @@ -0,0 +1,3388 @@ +from __future__ import annotations + +import logging +import re +import sqlglot +from sqlglot import exp +import typing as t + +from sqlmesh.core.engine_adapter.base import ( + InsertOverwriteStrategy, + get_source_columns_to_types, +) +from sqlmesh.core.engine_adapter.mixins import ( + ClusteredByMixin, + LogicalMergeMixin, + PandasNativeFetchDFSupportMixin, +) +from sqlmesh.core.engine_adapter.shared import ( + CommentCreationTable, + CommentCreationView, + DataObject, + DataObjectType, + set_catalog, + to_schema, +) +from sqlmesh.core.node import IntervalUnit +from sqlmesh.utils.errors import SQLMeshError + +if t.TYPE_CHECKING: + from sqlmesh.core._typing import SchemaName, TableName + from sqlmesh.core.engine_adapter._typing import QueryOrDF + +logger = logging.getLogger(__name__) + + +############################################################################### +# Declarative Type System for Property Validation and Normalization +############################################################################### +""" +Declarative type system for property validation and normalization. + +This module provides a declarative way to define property types with clear separation +between validation (type checking) and normalization (type conversion). +""" +Validated = t.Any # validated intermediate value (AST nodes, string, list...) +Normalized = t.Any # final normalized output + +# Allowed outputs for EnumType normalize / or general property outputs. +PROPERTY_OUTPUT_TYPES = { + "str", # "HASH" + "var", # exp.Var("ASYNC") + "identifier", # exp.Identifier + "literal", # exp.Literal.string("HASH") + "column", # exp.Column(this="HASH") + "ast_expr", # generic exp.Expression +} + + +# ============================================================ +# Fragment parser (robust-ish) +# ============================================================ +def parse_fragment(text: str) -> t.Union[exp.Expression, t.List[exp.Expression]]: + """ + Try to parse a DSL fragment into SQLGlot AST(s). + + Behavior: + 1. If parse_one succeeds, return the exp.Expression. + 2. If fails but text contains comma, split by commas and parse each part. + 3. If it's parenthesized like "(a, b)", parse and return exp.Tuple or list. + 4. If it's a simple token like "IDENT", return exp.Identifier. 
+ """ + if isinstance(text, exp.Expression): + return text + + if not isinstance(text, str): + raise TypeError("parse_fragment expects a string") + + s = text.strip() + try: + parsed = sqlglot.parse_one(s) + return parsed + except Exception: + raise ValueError(f"Unable to parse fragment: {s}") + + +# ============================================================ +# Base Type +# ============================================================ +class DeclarativeType: + """ + Base class for declarative type system. + + Design Philosophy: + ----------------- + - validate(value): Type checking only - returns validated intermediate value or None + - normalize(validated): Type conversion only - transforms to target output format + + Methods: + -------- + validate(value) -> Optional[Validated] + Check if value conforms to this type, maybe include some tiny different types + Returns: Validated intermediate value if valid, None otherwise. + + normalize(validated) -> Normalized + Convert validated intermediate value to final output format. + Returns: Normalized value in target format. + + __call__(value) -> Normalized + Convenience method: validate + normalize in one step. + """ + + def validate(self, value: t.Any) -> t.Optional[Validated]: + """Check if value conforms to this type. Return validated value or None. + String that can be parsed as literal + """ + raise NotImplementedError(f"{self.__class__.__name__}.validate() must be implemented") + + def normalize(self, validated: Validated) -> Normalized: + """Convert validated intermediate value to final output format.""" + # Default: identity transformation + return validated + + def __call__(self, value: t.Any) -> Normalized: + """Validate and normalize in one step.""" + validated = self.validate(value) + if validated is None: + raise ValueError(f"Value {value!r} does not conform to type {self.__class__.__name__}") + return self.normalize(validated) + + +# ============================================================ +# Primitive Types +# ============================================================ +class StringType(DeclarativeType): + """ + String type validator. + + Accepts: + - Python str only + + Validation: Returns the string if valid, None otherwise. + Normalization: Returns the string as-is (identity). + """ + + def __init__(self, normalized_type: str = "str"): + """ + Args: + normalized_type: Target type for normalization. + - "literal": Convert to exp.Literal.string() + - "str": Keep as string (default) + - "identifier": Convert to exp.Identifier + """ + self.normalized_type = normalized_type + + def validate(self, value: t.Any) -> t.Optional[str]: + """Check if value is a Python string. Returns string or None.""" + return value if isinstance(value, str) else None + + def normalize(self, validated: str) -> str: + """Return string as-is (identity normalization).""" + return validated + + +class LiteralType(DeclarativeType): + """ + Literal type validator. + + Accepts: + - exp.Literal only (from AST) + - String that can be parsed as literal + + Validation: Returns exp.Literal if valid, None otherwise. + Normalization: Converts to target type based on normalized_type parameter. + """ + + def __init__(self, normalized_type: t.Optional[str] = None): + """ + Args: + normalized_type: Target type for normalization. 
+ - None: Keep as exp.Literal (default) + - "literal": Keep as exp.Literal + - "str": Convert to Python string + """ + self.normalized_type = normalized_type + + def validate(self, value: t.Any) -> t.Optional[exp.Literal]: + """Check if value is a literal type. Returns exp.Literal or None.""" + # Try parsing string first + if isinstance(value, str): + try: + value = parse_fragment(value) + except Exception: + return None + + # Check if it's a Literal + if isinstance(value, exp.Literal): + return value + + return None + + def normalize(self, validated: exp.Literal) -> t.Union[exp.Literal, str]: + """Convert to target type based on normalized_type.""" + if self.normalized_type == "str": + return validated.this + # None or "literal" - keep as-is + return validated + + +class IdentifierType(DeclarativeType): + """ + Identifier type validator. + + Accepts: + - exp.Identifier only + - String that can be parsed as identifier + + Validation: Returns exp.Identifier if valid, None otherwise. + Normalization: Converts to target type based on normalized_type parameter. + """ + + def __init__(self, normalized_type: t.Optional[str] = None): + """ + Args: + normalized_type: Target type for normalization. + - None: Keep as exp.Identifier (default) + - "literal": Convert to exp.Literal.string() + - "str": Convert to Python string + - "identifier": Keep as exp.Identifier + - "column": Convert to exp.Column + """ + self.normalized_type = normalized_type + + def validate(self, value: t.Any) -> t.Optional[exp.Identifier]: + """Check if value is an identifier type. Returns exp.Identifier or None.""" + # Try parsing string first + if isinstance(value, str): + try: + value = parse_fragment(value) + except Exception: + return None + + # Check if it's an Identifier + if isinstance(value, exp.Identifier): + return value + + return None + + def normalize( + self, validated: exp.Identifier + ) -> t.Union[exp.Identifier, exp.Column, exp.Literal, str]: + """Convert to target type based on normalized_type.""" + if self.normalized_type == "column": + return exp.column(validated.this) + if self.normalized_type == "literal": + return exp.Literal.string(validated.this) + if self.normalized_type == "str": + return validated.this + # None or "identifier" - keep as-is + return validated + + +class ColumnType(DeclarativeType): + """ + Column type validator. + + Accepts: + - exp.Column only + - String that can be parsed as column + + Validation: Returns exp.Column if valid, None otherwise. + Normalization: Converts to target type based on normalized_type parameter. + """ + + def __init__(self, normalized_type: t.Optional[str] = None): + """ + Args: + normalized_type: Target type for normalization. + - None: Keep as exp.Column (default) + - "literal": Convert to exp.Literal.string() + - "str": Convert to Python string + - "identifier": Convert to exp.Identifier + - "column": Keep as exp.Column + """ + self.normalized_type = normalized_type + + def validate(self, value: t.Any) -> t.Optional[exp.Column]: + """Check if value is a column type. 
Returns exp.Column or None.""" + # Try parsing string first + if isinstance(value, str): + try: + value = parse_fragment(value) + except Exception: + return None + + # Check if it's a Column + if isinstance(value, exp.Column): + return value + + return None + + def normalize( + self, validated: exp.Column + ) -> t.Union[exp.Column, exp.Identifier, exp.Literal, str]: + """Convert to target type based on normalized_type.""" + if self.normalized_type == "identifier": + return exp.Identifier(this=validated.this) + if self.normalized_type == "literal": + return exp.Literal.string(validated.this) + if self.normalized_type == "str": + return str(validated.this) + # None or "column" - keep as-is + return validated + + +class EqType(DeclarativeType): + """ + EQ expression type validator (key=value pairs). + + Accepts: + - exp.EQ(left, right) + - String that can be parsed as key=value + + Validation: Returns (key_name, value_expr) tuple if valid, None otherwise. + Normalization: Returns the (key, value) tuple as-is. + """ + + def validate(self, value: t.Any) -> t.Optional[t.Tuple[str, t.Any]]: + """Check if value is an EQ expression. Returns (key, value) tuple or None.""" + # Try parsing string first + if isinstance(value, str): + try: + value = parse_fragment(value) + except Exception: + return None + + # Check if it's an EQ expression + if isinstance(value, exp.EQ): + # Extract key name from left side + left = value.this + # Extract value from right side + right = value.expression + + key_name = None + if isinstance(left, exp.Column): + key_name = left.this.name if hasattr(left.this, "name") else str(left.this) + elif isinstance(left, exp.Identifier): + key_name = left.this + elif isinstance(left, str): + key_name = left + else: + key_name = str(left) + + return (key_name, right) + + return None + + def normalize(self, validated: t.Tuple[str, t.Any]) -> t.Tuple[str, t.Any]: + """Return (key, value) tuple as-is (identity normalization).""" + return validated + + +class EnumType(DeclarativeType): + """ + Enumerated value type validator. + + Accepts values from a predefined set of allowed values. + Following input types are allowed: + - str + - exp.Literal + - exp.Var + - exp.Identifier + - exp.Column + + Parameters: + ----------- + valid_values : t.Sequence[str] + List of allowed values (e.g., ["HASH", "RANDOM"]) + normalized_type : t.Optional[str] + Target type for normalization: + - "str": Python string (default) + - "identifier": exp.Identifier + - "literal": exp.Literal.string() + - "column": exp.Column + - "ast_expr": generic exp.Expression (defaults to Identifier) + case_sensitive : bool + Whether to perform case-sensitive matching (default: False) + + Validation: Checks if value is in allowed set, returns canonical string. + Normalization: Converts to specified target type. 
+ """ + + def __init__( + self, + valid_values: t.Sequence[str], + normalized_type: str = "str", + case_sensitive: bool = False, + ): + self.valid_values = list(valid_values) + self.case_sensitive = bool(case_sensitive) + self.normalized_type = normalized_type + + if self.normalized_type is not None and self.normalized_type not in PROPERTY_OUTPUT_TYPES: + raise ValueError( + f"normalized_type must be one of {PROPERTY_OUTPUT_TYPES}, got {self.normalized_type!r}" + ) + + # Pre-compute normalized values for efficient lookup + self._values_normalized = [v if case_sensitive else v.upper() for v in self.valid_values] + + def _extract_text(self, value: t.Any) -> t.Optional[str]: + """Extract text from various value types.""" + if isinstance(value, str): + return value + if isinstance(value, (exp.Literal, exp.Var)): + return str(value.this) + if isinstance(value, (exp.Identifier, exp.Column)): + # For Identifier/Column, this might be another Expression + if isinstance(value.this, str): + return value.this + elif hasattr(value.this, "name"): # noqa: RET505 + return str(value.this.name) + else: + return str(value.this) + return None + + def _normalize_text(self, text: str) -> str: + """Normalize text for comparison based on case sensitivity.""" + return text if self.case_sensitive else text.upper() + + def validate(self, value: t.Any) -> t.Optional[str]: + """Check if value is in the allowed enum set. Returns canonical string or None.""" + # Try parsing string first + if isinstance(value, str): + try: + parsed = parse_fragment(value) + # If parsed successfully, extract text from AST node + if isinstance(parsed, (exp.Identifier, exp.Literal, exp.Column)): + value = parsed + except Exception: + # If parsing fails, treat as plain string + pass + + # Extract text from value + text = self._extract_text(value) + + if text is None: + return None + + # Normalize and check against allowed values + normalized_text = self._normalize_text(text) + if normalized_text in self._values_normalized: + return normalized_text + + return None + + def normalize(self, validated: str) -> Normalized: + """Convert validated enum string to target type.""" + # validated is already canonical (e.g., "HASH") + if self.normalized_type is None or self.normalized_type == "str": + return validated + if self.normalized_type == "var": + return exp.Var(this=validated) + if self.normalized_type == "literal": + return exp.Literal.string(validated) + if self.normalized_type == "identifier": + return exp.Identifier(this=validated) + if self.normalized_type == "column": + return exp.Column(this=validated) + if self.normalized_type == "ast_expr": + return exp.Identifier(this=validated) + + # Fallback to string + return validated + + +class FuncType(DeclarativeType): + """ + Function type validator. + + Accepts: + - exp.Func (built-in functions like date_trunc, CAST, etc.) + - exp.Anonymous (custom/dialect functions like RANGE, LIST) + - String that can be parsed as function call + + Validation: Returns exp.Func or exp.Anonymous if valid, None otherwise. + Normalization: Returns the function expression as-is (identity). + + Examples: + date_trunc('day', col1) → exp.Func + RANGE(col1, col2) → exp.Anonymous + LIST(region, status) → exp.Anonymous + """ + + def validate(self, value: t.Any) -> t.Optional[t.Union[exp.Func, exp.Anonymous]]: + """Check if value is a function type. 
Returns exp.Func/exp.Anonymous or None.""" + # Try parsing string first + if isinstance(value, str): + try: + value = parse_fragment(value) + except Exception: + return None + + # Check if it's a Func or Anonymous function + if isinstance(value, (exp.Func, exp.Anonymous)): + return value + + return None + + def normalize( + self, validated: t.Union[exp.Func, exp.Anonymous] + ) -> t.Union[exp.Func, exp.Anonymous]: + """Return function expression as-is (identity normalization).""" + return validated + + +# ============================================================ +# AnyOf (combinator) +# ============================================================ +class AnyOf(DeclarativeType): + """ + Union type - accepts first matching subtype. + + This is a combinator type that tries each subtype in order and accepts + the first one that validates successfully. + + Validation: Tries each subtype, returns (matched_type, validated_value) tuple. + Normalization: Uses the matched subtype's normalize method. + """ + + def __init__(self, *types: DeclarativeType): + if not types: + raise ValueError("AnyOf requires at least one type") + + # Validate all types are DeclarativeType instances + for type_ in types: + if not isinstance(type_, DeclarativeType): + raise TypeError(f"AnyOf expects DeclarativeType instances, got {type_!r}") + + self.types: t.List[DeclarativeType] = list(types) + + def validate(self, value: t.Any) -> t.Optional[t.Tuple[DeclarativeType, Validated]]: + """Try each subtype in order, return (matched_type, validated_value) or None.""" + for sub_type in self.types: + validated = sub_type.validate(value) + if validated is not None: + # Return both the matched type and validated value + return (sub_type, validated) + + # No type matched + return None + + def normalize(self, validated: t.Tuple[DeclarativeType, Validated]) -> Normalized: + """Normalize using the matched subtype's normalize method.""" + matched_type, validated_value = validated + return matched_type.normalize(validated_value) + + +# ============================================================ +# SequenceOf (Tuple/List/Paren/Single -> normalized list/tuple) +# ============================================================ +class SequenceOf(DeclarativeType): + """ + Sequence/List type validator with built-in union type support. + + Accepts various sequence representations and validates each element against + one or more possible types (similar to AnyOf for each element). + Optionally accepts single elements (promoted to single-item lists). + + Accepts: + - exp.Tuple: (a, b, c) + - exp.Array: [a, b, c] + - exp.Paren: (a) or ((a, b)) + - Python list/tuple: [a, b] or (a, b) + - String: "a, b, c" (parsed) + - Single element: a (if allow_single=True, promoted to [a]) + + Validation: Returns list of (matched_type, validated_value) tuples or None. + Normalization: Returns list of normalized elements using matched type's normalize. + + Examples: + # Single type + SequenceOf(ColumnType()) + + # Multiple types (union) - each element tries types in order + SequenceOf(ColumnType(), IdentifierType(), LiteralType()) + + # Allow single element + SequenceOf(ColumnType(), allow_single=True) + + # Multiple types + allow single + SequenceOf(ColumnType(), IdentifierType(), allow_single=True) + """ + + def __init__( + self, + *elem_types: DeclarativeType, + allow_single: bool = False, + output_as: str = "list", + ): + """ + Args: + *elem_types: One or more type validators for elements. 
+ If multiple types provided, each element tries types in order (AnyOf behavior). + allow_single: Whether to accept single elements (promoted to list). Default: False. + output_as: Output format - "list" or "tuple". Default: "list". + """ + if not elem_types: + raise ValueError("SequenceOf requires at least one element type") + + self.elem_types: t.List[DeclarativeType] = list(elem_types) + self.allow_single = allow_single + self.output_as = output_as + + def validate(self, value: t.Any) -> t.Optional[t.List[t.Tuple[DeclarativeType, Validated]]]: + """Validate each element in the sequence. Returns list of (matched_type, validated_value) tuples or None.""" + # Extract elements from various container types + elems = self._extract_elements(value) + if elems is None: + return None + + # Validate each element against all possible types (AnyOf behavior) + validated_items: t.List[t.Tuple[DeclarativeType, Validated]] = [] + for elem in elems: + # Try each type until one matches + matched = False + for elem_type in self.elem_types: + validated = elem_type.validate(elem) + if validated is not None: + validated_items.append((elem_type, validated)) + matched = True + break + + # If no type matched, the whole sequence fails if any element fails + if not matched: + return None + + return validated_items + + def normalize( + self, validated: t.List[t.Tuple[DeclarativeType, Validated]] + ) -> t.Union[t.List[Normalized], t.Tuple[Normalized, ...]]: + """Normalize each validated element using its matched type's normalize method.""" + normalized_items = [elem_type.normalize(value) for elem_type, value in validated] + + # Convert to desired output format + if self.output_as == "tuple": + return tuple(normalized_items) + return normalized_items # default: list + + def _extract_elements(self, value: t.Any) -> t.Optional[t.List[t.Any]]: + """ + Extract elements from various container representations. + Returns list of raw elements or None if extraction fails. + """ + # Python list/tuple - process first before string parsing + if isinstance(value, (list, tuple)): + return list(value) + + # Try parsing string for AST types + if isinstance(value, str): + try: + value = parse_fragment(value) + except Exception: + # If parsing fails and we accept single strings, promote to list + if self.allow_single and any(isinstance(t, StringType) for t in self.elem_types): + return [value] + return None + + # SQL Tuple: (a, b, c) + if isinstance(value, exp.Tuple): + return list(value.expressions) + + # SQL Array: [a, b, c] + if isinstance(value, exp.Array): + return list(value.expressions) + + # SQL Paren: (a) or ((a, b)) + if isinstance(value, exp.Paren): + inner = value.this + if isinstance(inner, exp.Tuple): + return list(inner.expressions) + return [inner] + + # Single AST element: promote to list (if allow_single) + if self.allow_single and isinstance(value, exp.Expression): + return [value] + + return None + + +# ============================================================ +# Field Definition for Structured Types +# ============================================================ +class Field: + """ + Field specification for StructuredTupleType. + + Defines validation rules, types, and metadata for a single field. 
+ + Args: + type: DeclarativeType instance for validating field value + required: Whether this field is required (default: False) + aliases: List of alternative field names (default: []) + doc: Documentation string for this field + + Example: + Field( + type=EnumType(["HASH", "RANDOM"]), + required=True, + aliases=["distribution_type"], + doc="Distribution kind: HASH or RANDOM" + ) + """ + + def __init__( + self, + type: DeclarativeType, + required: bool = False, + aliases: t.Optional[t.List[str]] = None, + doc: t.Optional[str] = None, + ): + self.type = type + self.required = required + self.aliases = aliases or [] + self.doc = doc + + +# ============================================================ +# StructuredTupleType - Base class for typed tuples +# ============================================================ +class StructuredTupleType(DeclarativeType): + """ + Base class for validating tuples with typed fields. + + Subclasses define FIELDS dict to specify structure: + + FIELDS = { + "field_name": Field( + type=SomeType(), + required=True, + aliases=["alt_name1", "alt_name2"] + ), + ... + } + + Validation Process: + 1. Parse tuple into key=value pairs (exp.EQ) + 2. Match keys against FIELDS (including aliases) + 3. Validate each field value with specified type + 4. Check required fields are present + 5. Handle unknown/invalid fields based on error flags + + Returns: Dict[str, Any] with canonical field names as keys + + Example: + class DistributionTupleInputType(StructuredTupleType): + FIELDS = { + "kind": Field(type=EnumType(["HASH", "RANDOM"]), required=True), + "columns": Field(type=SequenceOf(ColumnType())), + } + + Args: + error_on_unknown_field: If True, raise error when encountering unknown fields. + If False, silently skip unknown fields (default: False) + error_on_invalid_field: If True, raise error when field value validation fails. + If False, return None for entire validation (default: True) + """ + + FIELDS: t.Dict[str, Field] = {} # Subclasses override this + + def __init__(self, error_on_unknown_field: bool = True, error_on_invalid_field: bool = True): + self.error_on_unknown_field = error_on_unknown_field + self.error_on_invalid_field = error_on_invalid_field + + # Build alias mapping: alias -> canonical_name + self._alias_map: t.Dict[str, str] = {} + for field_name, field_spec in self.FIELDS.items(): + # Map canonical name to itself + self._alias_map[field_name] = field_name + # Map aliases to canonical name + for alias in field_spec.aliases: + self._alias_map[alias] = field_name + + def validate( + self, value: t.Any + ) -> t.Optional[t.Dict[str, t.Tuple[DeclarativeType, Validated]]]: + """ + Validate structured tuple. + + Returns: Dict mapping canonical field names to (matched_type, validated_value) tuples, + or None if validation fails. 
+ + Raises: + ValueError: If error_on_unknown_field=True and unknown field encountered + ValueError: If error_on_invalid_field=True and field validation fails + """ + # Try parsing string first + if isinstance(value, str): + try: + value = parse_fragment(value) + except Exception: + return None + + # Extract key=value pairs from tuple/paren + pairs = self._extract_pairs(value) + if pairs is None: + return None + + # Validate each pair and build result dict + result: t.Dict[str, t.Tuple[DeclarativeType, Validated]] = {} + eq_type = EqType() + + for pair_expr in pairs: + # Validate as EQ expression + eq_validated = eq_type.validate(pair_expr) + if eq_validated is None: + continue # Skip non-EQ expressions + + key, value_expr = eq_validated + + # Resolve alias to canonical name + canonical_name = self._alias_map.get(key) + if canonical_name is None: + # Unknown field + if self.error_on_unknown_field: + raise ValueError( + f"Unknown field '{key}' in {self.__class__.__name__}. " + f"Valid fields: {list(self.FIELDS.keys())}" + ) + # Skip unknown field + continue + + # Get field spec + field_spec = self.FIELDS[canonical_name] + + # Validate field value with specified type + validated_value = field_spec.type.validate(value_expr) + if validated_value is None: + # Field validation failed + if self.error_on_invalid_field: + raise ValueError( + f"Invalid value for field '{canonical_name}': {value_expr}. " + f"Expected type: {field_spec.type.__class__.__name__}, " + f"Actual type: {type(value_expr).__name__}" + ) + # Return None for entire validation + return None + + # Store with canonical name + result[canonical_name] = (field_spec.type, validated_value) + + # Check required fields + for field_name, field_spec in self.FIELDS.items(): + if field_spec.required and field_name not in result: + # Required field missing + if self.error_on_invalid_field: + raise ValueError( + f"Required field '{field_name}' is missing in {self.__class__.__name__}" + ) + return None + + return result + + def normalize( + self, validated: t.Dict[str, t.Tuple[DeclarativeType, Validated]] + ) -> t.Dict[str, Normalized]: + """ + Normalize validated fields. + + Returns: Dict mapping canonical field names to normalized values. + """ + return { + field_name: field_type.normalize(value) + for field_name, (field_type, value) in validated.items() + } + + def _extract_pairs(self, value: t.Any) -> t.Optional[t.List[t.Any]]: + """ + Extract list of expressions from tuple/paren. + Each expression should be an exp.EQ (key=value). + """ + # exp.Tuple: (a=1, b=2) + if isinstance(value, list): + return value + if isinstance(value, exp.Tuple): + return list(value.expressions) + + # exp.Paren: (a=1) or ((a=1, b=2)) + if isinstance(value, exp.Paren): + inner = value.this + if isinstance(inner, exp.Tuple): + return list(inner.expressions) + return [inner] + + return None + + +class DistributionTupleInputType(StructuredTupleType): + """ + StarRocks distribution tuple validator. 
+ + Accepts: + - (kind='HASH', columns=(id, dt), buckets=10) + - (kind='HASH', expressions=(id, dt), bucket_num=10) + - (kind='RANDOM') + + Returns: Dict with fields: + - kind: "HASH" or "RANDOM" (string) + - columns: List[exp.Column] (optional, for HASH) + - buckets: exp.Literal (optional) + + Field Aliases: + - columns: expressions + - buckets: bucket, bucket_num + + Examples: + Input: (kind='HASH', columns=(id, dt), buckets=10) + Output: { + 'kind': 'HASH', + 'columns': [exp.Column('id'), exp.Column('dt')], + 'buckets': exp.Literal.number(10) + } + + Input: (kind='RANDOM') + Output: {'kind': 'RANDOM'} + + Conversion: + Use factory methods to convert normalized values to unified dict format: + - from_enum(): Convert EnumType normalized value (str) → dict + - from_func(): Convert FuncType normalized value (exp.Func) → dict + - to_unified_dict(): Convert any normalized value → dict + """ + + FIELDS = { + "kind": Field( + type=EnumType(["HASH", "RANDOM"], normalized_type="str"), + required=True, + doc="Distribution type: HASH or RANDOM", + ), + "columns": Field( + type=SequenceOf( + ColumnType(), + IdentifierType(normalized_type="column"), + allow_single=True, + ), + required=False, + aliases=["expressions"], + doc="Columns for HASH distribution", + ), + "buckets": Field( + type=AnyOf(LiteralType(), StringType(normalized_type="literal")), + required=False, + aliases=["bucket", "bucket_num"], + doc="Number of buckets", + ), + } + + +class DistributionTupleOutputType(StructuredTupleType): + """ + Output validator for distribution tuple. + + Used to validate normalized distribution values which are already dicts. + Overrides validate() to handle dict input directly (for output validation), + while parent class handles tuple/string input (for input validation). + """ + + FIELDS = { + "kind": Field( + type=EnumType(["HASH", "RANDOM"]), + required=True, + ), + "columns": Field( + type=SequenceOf(ColumnType(), allow_single=False), + required=False, + ), + "buckets": Field( + type=LiteralType(), + required=False, + ), + } + + def validate(self, value: t.Any) -> t.Optional[t.Dict[str, t.Any]]: + """ + Validate a distribution value for OUTPUT validation. 
+ + For output validation, accepts: + - dict: Validate structure directly (normalized output) + - tuple/string: Delegate to parent class (for completeness) + + Returns: The dict if valid, None otherwise + """ + # For output validation, handle dict directly + if isinstance(value, dict): + # Validate required 'kind' field + kind = value.get("kind") + if kind is None: + return None + + # Validate 'kind' is a valid enum value + kind_spec = self.FIELDS["kind"].type + if kind_spec.validate(kind) is None: + return None + + # Validate 'columns' if present + columns = value.get("columns") + if columns is not None: + columns_spec = self.FIELDS["columns"].type + if columns_spec.validate(columns) is None: + return None + + # Validate 'buckets' if present + buckets = value.get("buckets") + if buckets is not None: + buckets_spec = self.FIELDS["buckets"].type + if buckets_spec.validate(buckets) is None: + return None + + return value + + # For tuple/string, delegate to parent class + return super().validate(value) + + # ============================================================ + # Factory methods for conversion from other normalized types + # ============================================================ + + @staticmethod + def from_enum(enum_value: str, buckets: t.Optional[int] = None) -> t.Dict[str, t.Any]: + """ + Create distribution dict from EnumType normalized value. + + Args: + enum_value: "RANDOM" (from EnumType) + buckets: Optional bucket count + + Returns: + Dict with kind/columns/buckets fields + + Example: + >>> DistributionTupleOutputType.from_enum("RANDOM") + {'kind': 'RANDOM', 'columns': [], 'buckets': None} + """ + return {"kind": enum_value, "columns": [], "buckets": buckets} + + @staticmethod + def from_func( + func: t.Union[exp.Func, exp.Anonymous], buckets: t.Optional[int] = None + ) -> t.Dict[str, t.Any]: + """ + Create distribution dict from FuncType normalized value. + + Args: + func: HASH(id, dt) or RANDOM() (from FuncType) + buckets: Optional bucket count + + Returns: + Dict with kind/columns/buckets fields + + Example: + >> func = sqlglot.parse_one("HASH(id, dt)") + >> DistributionTupleOutputType.from_func(func) + {"kind": "HASH", "columns": [exp.Column("id"), exp.Column("dt")], "buckets": None} + """ + func_name = func.name.upper() if hasattr(func, "name") else str(func.this).upper() + + if func_name == "HASH": + # Extract columns from HASH(col1, col2, ...) + columns: list[exp.Column] = [func.this] if isinstance(func.this, exp.Column) else [] + columns.extend(func.expressions) + return {"kind": "HASH", "columns": columns, "buckets": buckets} + elif func_name == "RANDOM": # noqa: RET505 + return {"kind": "RANDOM", "columns": [], "buckets": buckets} + else: + raise ValueError(f"Unknown distribution function: {func_name}") + + @staticmethod + def to_unified_dict( + normalized_value: t.Any, buckets: t.Optional[int] = None + ) -> t.Dict[str, t.Any]: + """ + Convert any normalized distribution value to unified dict format. + + This is a convenience method that dispatches to appropriate factory method. 
+ + Args: + normalized_value: Result from DistributedByInputSpec normalization + (dict | str | exp.Func) + buckets: Optional bucket count override + + Returns: + Unified dict with kind/columns/buckets fields + + Raises: + TypeError: If value type is not supported + + Example: + >>> # From DistributionTupleOutputType + >>> DistributionTupleOutputType.to_unified_dict({"kind": "HASH", "columns": [...]}) + {'kind': 'HASH', 'columns': [Ellipsis]} + + >>> # From EnumType + >>> DistributionTupleOutputType.to_unified_dict("RANDOM") + {'kind': 'RANDOM', 'columns': [], 'buckets': None} + + >> # From FuncType + >> DistributionTupleOutputType.to_unified_dict(sqlglot.parse_one("HASH(id)")) + {'kind': 'HASH', 'columns': [exp.Column('id')], 'buckets': None} + """ + if isinstance(normalized_value, dict): + # Already in DistributionTupleInputType format + return normalized_value + elif isinstance(normalized_value, str): # noqa: RET505 + # From EnumType: "RANDOM" + return DistributionTupleOutputType.from_enum(normalized_value, buckets) + elif isinstance(normalized_value, (exp.Func, exp.Anonymous)): + # From FuncType: HASH(id, dt) + return DistributionTupleOutputType.from_func(normalized_value, buckets) + else: + raise TypeError( + f"Cannot convert {type(normalized_value).__name__} to distribution dict. " + f"Expected dict, str, or exp.Func/exp.Anonymous." + ) + + +# ============================================================ +# Type Specifications for StarRocks Properties (INPUT and OUTPUT) +# ============================================================ +class PropertySpecs: + # Accepts: + # - Single column: id + # - Multiple columns: (id, dt) + # - String for string input: "id, dt" (will be auto-wrapped and parsed by preprocess_parentheses) + GeneralColumnListInputSpec = SequenceOf( + ColumnType(), + StringType(normalized_type="column"), + IdentifierType(normalized_type="column"), + allow_single=True, + ) + + # TableKey: Simple key specification (primary_key, duplicate_key, unique_key, aggregate_key) + # Accepts: + # - Single column: id + # - Multiple columns: (id, dt) + TableKeyInputSpec = GeneralColumnListInputSpec + + # Partitioned By: Flexible partition specification + # Accepts: + # - Single column: col1 + # - Multiple columns: (col1, col2) + # - Mixed: (col1, "col2") - string will be parsed + # - RANGE(col1) or RANGE(col1, col2) + # - LIST(col1) or LIST(col1, col2) + # - Expression: (date_trunc('day', col1), col2) + PartitionedByInputSpec = SequenceOf( + ColumnType(), + StringType(normalized_type="column"), + IdentifierType(normalized_type="column"), + FuncType(), # RANGE(), LIST(), date_trunc(), etc. + allow_single=True, + ) + + # Partitions: List of partition definitions (strings) + # Accepts: + # - Single partition: 'PARTITION p1 VALUES LESS THAN ("2024-01-01")' + # - Multiple partitions: ('PARTITION p1 ...', 'PARTITION p2 ...') + # Note: Single string is auto-promoted to list + PartitionsInputSpec = SequenceOf( + StringType(), LiteralType(normalized_type="str"), allow_single=True + ) + + # Distribution: StarRocks distribution specification + # Accepts: + # - Structured tuple1: (kind='HASH', columns=(id, dt), buckets=10) + # - Structured tuple2: (kind='RANDOM') + # - String format: "HASH(id)", "RANDOM", or "(kind='HASH', columns=(id), buckets=10)" + # Note: Does NOT accept simple columns like id or (id, dt) + # And it can't directly accept "HASH(id) BUCKETS 10", you need to split it with "BUCKETS" to two parts. 
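+    # Hedged illustration of that BUCKETS split (the column name is an example only):
+    #     instead of:  distributed_by = "HASH(id) BUCKETS 10"      # rejected as a single string
+    #     use:         distributed_by = (kind='HASH', columns=(id), buckets=10)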
+ DistributedByInputSpec = AnyOf( + DistributionTupleInputType(), # Try structured tuple first (most specific) + EnumType(["RANDOM"], normalized_type="str"), # "RANDOM" + FuncType(), # "HASH(id)", + ) + + # OrderBy: Simple ordering specification + # Accepts: + # - Single column: dt + # - Multiple columns: (dt, id, status) + OrderByInputSpec = GeneralColumnListInputSpec + + # Refresh scheme: Accepts various types, normalizes to string + # For properties like refresh_scheme, it can be a string, identifier, or column + RefreshSchemeInputSpec = AnyOf( + EnumType(["ASYNC", "MANUAL"], normalized_type="var"), + ColumnType(normalized_type="str"), # Columns → will be converted to string + IdentifierType(normalized_type="str"), # Identifiers → will be converted to string + LiteralType(normalized_type="str"), # Numbers and string → to string + StringType(), # Plain strings + ) + + # Generic property value: Accepts various types, normalizes to string + # For properties like replication_num, storage_medium, etc. + # StarRocks PROPERTIES syntax requires all values to be strings: "value" + # So we normalize everything to string for consistent SQL generation + GenericPropertyInputSpec = AnyOf( + StringType(), # Plain strings + LiteralType(normalized_type="str"), # Numbers and string → will be converted to string + IdentifierType(normalized_type="str"), # Identifiers → will be converted to string + ColumnType(normalized_type="str"), # Columns → will be converted to string + ) + + """ + Input Property Specification for StarRocks + + This specification defines the validation and normalization rules for StarRocks properties. + Properties are specified in the physical_properties block of a SQLMesh model. + + Supported properties: + - partitioned_by / partition_by: Partition specification + - partitions: List of partition definitions + - distributed_by: Distribution specification (HASH/RANDOM with structured tuple or string) + - order_by: Ordering specification (simple column list) + - table key: + - primary_key: Primary key columns + - duplicate_key: Duplicate key columns (for DUPLICATE KEY table) + - unique_key: Unique key columns (for UNIQUE KEY table) + - aggregate_key: Aggregate key columns (for AGGREGATE KEY table) + - other properties: Any other properties not listed above will be treated as generic + string properties (e.g., replication_num, storage_medium, etc.) 
+ + Examples: + duplicate_key = dt # Single key + primary_key = (id, customer_id) # Multiple keys + + partitioned_by = col1 # Single column + partitioned_by = (col1, col2) # Multiple columns + partitioned_by = (col1, "col2") # Mixed (string will be parsed) + partitioned_by = date_trunc('day', col1) # Expression partition with single func + partitioned_by = (date_trunc('day', col1), col2) # Expression partition with multiple exprs + partitioned_by = RANGE(col1, col2) # RANGE partition + partitioned_by = LIST(region, status) # LIST partition + + distributed_by = (kind='HASH', columns=(id, dt), buckets=10) # Structured + distributed_by = (kind='RANDOM') # RANDOM distribution + distributed_by = "HASH(id)" # String format + distributed_by = "RANDOM" # String format + + order_by = dt # Single column + order_by = (dt, id, status) # Multiple columns + + replication_num = 3 # Generic property (auto-handled) + storage_medium = "SSD" # Generic property (auto-handled) + """ + PROPERTY_INPUT_SPECS: t.Dict[str, DeclarativeType] = { + # Table key properties + "primary_key": TableKeyInputSpec, + "duplicate_key": TableKeyInputSpec, + "unique_key": TableKeyInputSpec, + "aggregate_key": TableKeyInputSpec, + # Partition-related properties + "partitioned_by": PartitionedByInputSpec, + "partitions": PartitionsInputSpec, + # Distribution property + "distributed_by": DistributedByInputSpec, + # Ordering property + "clustered_by": OrderByInputSpec, + # View properties + # StarRocks syntax: SECURITY {NONE | INVOKER | DEFINER} + "security": EnumType(["NONE", "INVOKER", "DEFINER"], normalized_type="str"), + # Materialized view refresh properties (StarRocks uses REFRESH ...) + # - refresh_moment: IMMEDIATE | DEFERRED + "refresh_moment": EnumType(["IMMEDIATE", "DEFERRED"], normalized_type="str"), + # - refresh_scheme: ASYNC | ASYNC [START (...) EVERY (INTERVAL ...)] | MANUAL + # it should be a string/literal if START/EVERY is present, other than ASYNC + "refresh_scheme": RefreshSchemeInputSpec, + # Note: All other properties not listed here will be handled, an example here + "replication_num": GenericPropertyInputSpec, + } + + # Default output spec for properties not in PROPERTY_OUTPUT_SPECS + GenericPropertyOutputSpec = StringType() + + """ + Output Property Specification for StarRocks after validation+normalization + + This specification describes the expected types after normalization. + For most properties, OUTPUT spec is the same as INPUT spec since normalization + preserves the diverse types (dict | str | exp.Func for distribution). + + Conversion to unified formats (e.g., all distributions → dict) happens separately + in the usage layer via factory methods like DistributionTupleInputType.to_unified_dict(). 
+ + Expected Output Types (after normalization): + - table keys: List[exp.Expression] - columns + - partitioned_by: List[exp.Expression] - columns, functions + - partitions: List[str] - partition definition strings + - distributed_by: Dict | str | exp.Func - DistributionTupleInputType, EnumType, or FuncType output + - order_by: List[exp.Expression] - columns + - generic properties: str - normalized string values + """ + GeneralColumnListOutputSpec: DeclarativeType = SequenceOf(ColumnType(), allow_single=False) + + PROPERTY_OUTPUT_SPECS: t.Dict[str, DeclarativeType] = { + "primary_key": GeneralColumnListOutputSpec, + "duplicate_key": GeneralColumnListOutputSpec, + "unique_key": GeneralColumnListOutputSpec, + "aggregate_key": GeneralColumnListOutputSpec, + "partitioned_by": SequenceOf(ColumnType(), FuncType(), allow_single=False), + "partitions": SequenceOf(StringType(), allow_single=False), + "distributed_by": AnyOf( + DistributionTupleOutputType(), # Try structured tuple first (most specific) + EnumType(["RANDOM"], normalized_type="str"), # "RANDOM" + FuncType(), # "HASH(id)", + ), # Still dict | str | exp.Func after normalize + "clustered_by": GeneralColumnListOutputSpec, + "security": EnumType(["NONE", "INVOKER", "DEFINER"], normalized_type="str"), + "refresh_moment": EnumType(["IMMEDIATE", "DEFERRED"], normalized_type="str"), + "refresh_scheme": AnyOf( + EnumType(["ASYNC", "MANUAL"], normalized_type="var"), + StringType(), + ), + # Generic properties use GenericPropertyOutputSpec, an example here + "replication_num": GenericPropertyOutputSpec, + } + + # ============================================================ + # Helper functions + # ============================================================ + + @staticmethod + def get_property_input_spec(property_name: str) -> DeclarativeType: + """ + Get the INPUT type validator for a property. + + Returns the specific type from PROPERTY_INPUT_SPECS if defined, + otherwise returns GenericPropertyInputSpec for unknown properties. + + This allows any property not explicitly defined to be treated + as a generic string property. + """ + return PropertySpecs.PROPERTY_INPUT_SPECS.get( + property_name, PropertySpecs.GenericPropertyInputSpec + ) + + @staticmethod + def get_property_output_spec(property_name: str) -> DeclarativeType: + """ + Get the OUTPUT type validator for a property. + + Returns the specific type from PROPERTY_OUTPUT_SPECS if defined, + otherwise returns GenericPropertyOutputSpec for unknown properties. + + This allows validating that normalized values conform to expected output types. + """ + return PropertySpecs.PROPERTY_OUTPUT_SPECS.get( + property_name, PropertySpecs.GenericPropertyOutputSpec + ) + + +# ============================================================ +# Property Validation Helpers +# ============================================================ +class PropertyValidator: + """ + Centralized property validation helpers for table properties. + + Provides reusable validation functions to avoid code duplication + and ensure consistent error messages across different property handlers. 
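+
+    A minimal usage sketch (hedged; the values mirror the examples documented on the methods below):
+
+        # normalize a property value against its input/output SPECs
+        PropertyValidator.validate_and_normalize_property("distributed_by", "RANDOM")   # -> "RANDOM"
+
+        # reject deprecated/invalid property names up front
+        PropertyValidator.check_all_invalid_names({"partition": "dt"})   # raises SQLMeshError, suggests 'partitioned_by'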
+ """ + + TABLE_KEY_TYPES = {"primary_key", "duplicate_key", "unique_key", "aggregate_key"} + + # All important properties except generic properties + IMPORTANT_PROPERTY_NAMES = { + *TABLE_KEY_TYPES, + "partitioned_by", + "partitions", + "distributed_by", + "clustered_by", + } + + # Centralized property alias configuration + # Maps canonical name -> list of valid aliases + PROPERTY_ALIASES: t.Dict[str, t.Set[str]] = { + "partitioned_by": {"partition_by"}, + "clustered_by": {"order_by"}, + } + + EXCLUSIVE_PROPERTY_NAME_MAP: t.Dict[str, t.Set[str]] = { + "key_type": set(TABLE_KEY_TYPES), + **PROPERTY_ALIASES, + } + + # Centralized invalid property name configuration + # Maps canonical name -> list of invalid/deprecated names + INVALID_PROPERTY_NAME_MAP: t.Dict[str, t.List[str]] = { + "partitioned_by": ["partition"], + "distributed_by": ["distribution", "distribute"], + "clustered_by": ["order", "ordering"], + } + + @staticmethod + def ensure_parenthesized(value: t.Any) -> t.Any: + """ + Ensure string value is wrapped in parentheses for parse_fragment compatibility. + + For string inputs like 'id1, id2', wraps to '(id1, id2)' so that + parse_fragment can parse it correctly. + + Args: + value: Input value (string, expression, or other) + + Returns: + - For strings/Literal/Column(quoted): wrapped in parentheses if not already + - For other types: returned unchanged + + Example: + >>> PropertyValidator.ensure_parenthesized('id1, id2') + '(id1, id2)' + >>> PropertyValidator.ensure_parenthesized('(id1, id2)') + '(id1, id2)' + >>> PropertyValidator.ensure_parenthesized(exp.Literal.string('id1, id2')) + '(id1, id2)' + >>> PropertyValidator.ensure_parenthesized(exp.Column(quoted=True, name='id1, id2')) + Column(quoted=True, name=id1, id2) + """ + # logger.debug("ensure_parenthesized. value: %s, type: %s", value, type(value)) + + # Extract string content from Literal + if isinstance(value, exp.Literal) and value.is_string: + value = value.this + # Extract string content from Column (quoted) + elif isinstance(value, exp.Column) and hasattr(value.this, "quoted") and value.this.quoted: + value = value.name # Column.name returns the string + elif not isinstance(value, str): + return value + + stripped = value.strip() + if not stripped: + return value + + # Check if already wrapped in parentheses + if stripped.startswith("(") and stripped.endswith(")"): + return value + + return f"({stripped})" + + @staticmethod + def validate_and_normalize_property( + property_name: str, value: t.Any, preprocess_parentheses: bool = False + ) -> t.Any: + """ + Complete property processing pipeline using SPEC: + 1. Optionally preprocess string with parentheses + 2. Get INPUT type validator + 3. Validate and normalize input value + 4. Get OUTPUT type validator + 5. Verify normalized output conforms to expected type + 6. Return verified output + + After validation, the output type is guaranteed by SPEC. + Unexpected types indicate SPEC configuration errors. + + Args: + property_name: Name of the property + value: The property value to validate + preprocess_parentheses: If True, wrap string values in parentheses + + Returns: + The normalized value + + Raises: + SQLMeshError: If validation fails + + Example: + >>> validated = PropertyValidator.validate_and_normalize_property("distributed_by", "RANDOM") + >>> # Result: "RANDOM" (string from EnumType) + """ + # logger.debug("validate_and_normalize_property. 
value: %s, type: %s", value, type(value)) + + # Step 1: Optionally preprocess string with parentheses + if preprocess_parentheses: + value = PropertyValidator.ensure_parenthesized(value) + + # Step 2: Get INPUT type validator + input_spec = PropertySpecs.get_property_input_spec(property_name) + if input_spec is None: + raise SQLMeshError(f"Unknown property '{property_name}'.") + + # Step 3: Validate + validated = input_spec.validate(value) + if validated is None: + raise SQLMeshError(f"Invalid value type for property '{property_name}': {value!r}.") + + # Step 4: Normalize + normalized = input_spec.normalize(validated) + + # Step 5: Check by using output spec + output_spec = PropertySpecs.get_property_output_spec(property_name) + if output_spec is not None: + if output_spec.validate(normalized) is None: + raise SQLMeshError( + f"Normalized value for property '{property_name}' doesn't match output spec: {normalized!r}." + ) + + # Step 6: Return + return normalized + + @staticmethod + def check_invalid_names( + valid_name: str, + invalid_names: t.List[str], + table_properties: t.Dict[str, t.Any], + suggestion: t.Optional[str] = None, + ) -> None: + """ + Check for invalid/deprecated property names and raise error with suggestion. + + Args: + valid_name: The correct property name + invalid_names: List of invalid/deprecated names to check for + table_properties: Table properties dictionary to check + suggestion: Optional custom error message suggestion + + Raises: + SQLMeshError: If any invalid name is found + + Example: + >> PropertyValidator.check_invalid_names( + ... valid_name="partitioned_by", + ... invalid_names=["partition_by", "partition"], + ... table_properties={"partition_by": "dt"} + ... ) + SQLMeshError: Invalid property 'partition_by'. Use 'partitioned_by' instead. + """ + for invalid_name in invalid_names: + if invalid_name in table_properties: + msg = suggestion or f"Use '{valid_name}' instead" + raise SQLMeshError(f"Invalid property '{invalid_name}'. {msg}.") + + @classmethod + def check_all_invalid_names(cls, table_properties: t.Dict[str, t.Any]) -> None: + """ + Check all invalid property names at once using INVALID_PROPERTY_NAME_MAP config. + + Args: + table_properties: Table properties dictionary to check + + Raises: + SQLMeshError: If any invalid name is found + """ + for valid_name, invalid_names in cls.INVALID_PROPERTY_NAME_MAP.items(): + cls.check_invalid_names(valid_name, invalid_names, table_properties) + + @staticmethod + def check_at_most_one( + property_name: str, + property_description: str, + table_properties: t.Dict[str, t.Any], + exclusive_property_names: t.Optional[t.Set[str]] = None, + parameter_value: t.Optional[t.Any] = None, + ) -> t.Optional[str]: + """ + Ensure at most one property from a mutually exclusive group is defined. + + Args: + property_name: the canonical name + property_description: description of the property group (for error messages) + exclusive_property_names: List of mutually exclusive property names. + Defaults to canonical name and aliases if not provided. + table_properties: Table properties dictionary to check + parameter_value: Optional parameter value (takes priority over table_properties) + + Returns: + Name of the active property, or None if none found + NOTE: If the parameter value is provided, it returns None + + Raises: + SQLMeshError: If multiple properties from the group are defined + + Example: + >> PropertyValidator.check_at_most_one( + ... property_name="primary_key", + ... property_description="key type", + ... 
exclusive_property_names=["primary_key", "duplicate_key", "unique_key", "aggregate_key"], + ... table_properties={"primary_key": "(id)", "duplicate_key": "(id)"} + ... ) + SQLMeshError: Multiple key type properties defined: ['primary_key', 'duplicate_key']. + Only one is allowed. + """ + if not exclusive_property_names: + exclusive_property_names = PropertyValidator.EXCLUSIVE_PROPERTY_NAME_MAP.get( + property_name, set() + ) | {property_name} + # logger.debug("Checking at most one property for '%s': %s", property_name, exclusive_property_names) + # Check parameter first (highest priority) + if parameter_value is not None: + # Check if any conflicting properties exist in table_properties + conflicts = [name for name in exclusive_property_names if name in table_properties] + if conflicts: + param_display = f"{property_name} (parameter)" + raise SQLMeshError( + f"Conflicting {property_description} definitions: " + f"{param_display} provided along with table_properties {conflicts}. " + f"Only one {property_description} is allowed." + ) + return None + + # Check table_properties for multiple definitions + present = [name for name in exclusive_property_names if name in table_properties] + # logger.debug("Get table key names for %s from table_properties: %s", property_name, present) + + if len(present) > 1: + raise SQLMeshError( + f"Multiple {property_description} properties defined: {present}. " + f"Only one is allowed." + ) + + return present[0] if present else None + + +############################################################################### +# StarRocks Engine Adapter +############################################################################### +@set_catalog() +class StarRocksEngineAdapter( + LogicalMergeMixin, + PandasNativeFetchDFSupportMixin, + ClusteredByMixin, +): + """ + StarRocks Engine Adapter for SQLMesh. + + StarRocks is a high-performance analytical database with its own dialect-specific + behavior. This adapter highlights a few key characteristics: + + 1. PRIMARY KEY support is native and must be emitted in the post-schema section. + 2. DELETE with subqueries is supported on PRIMARY KEY tables, but other key types still + need guard rails (no boolean literals, TRUNCATE for WHERE TRUE, etc.). + 3. Partitioning supports RANGE, LIST, and expression-based syntaxes. + + Implementation strategy: + - Override only where StarRocks syntax/behavior diverges from the base adapter. + - Keep the rest of the functionality delegated to the shared base implementation. + """ + + # ==================== Class Attributes (Declarative Configuration) ==================== + + DIALECT = "starrocks" + """SQLGlot dialect name for SQL generation""" + + DEFAULT_BATCH_SIZE = 10000 + """Default batch size for bulk operations""" + + SUPPORTS_TRANSACTIONS = False + """ + StarRocks does not support transactions for multiple DML statements. + - No BEGIN/COMMIT/ROLLBACK (only txn for multiple INSERT statements from v3.5) + - Operations are auto-committed + - Backfill uses partition-level atomicity + """ + + INSERT_OVERWRITE_STRATEGY = InsertOverwriteStrategy.DELETE_INSERT + """ + StarRocks does support INSERT OVERWRITE syntax (and dynamic overwrite from v3.5). + Use DELETE + INSERT pattern: + 1. DELETE FROM table WHERE condition + 2. INSERT INTO table SELECT ... + + Base class automatically handles this strategy without overriding insert methods. 
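+
+    A concrete (illustrative) backfill of one time slice therefore runs roughly as:
+        DELETE FROM db.tbl WHERE ds >= '2024-01-01' AND ds <= '2024-01-31'
+        INSERT INTO db.tbl SELECT ... FROM upstream_tbl WHERE ds >= '2024-01-01' AND ds <= '2024-01-31'
+    (db.tbl, upstream_tbl, and ds are placeholder names.)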
+ + TODO: later, we can add support for INSERT OVERWRITE, even use Primary Key for beter performance + """ + + COMMENT_CREATION_TABLE = CommentCreationTable.IN_SCHEMA_DEF_CTAS + """Table comments are added in both CREATE TABLE statement and CTAS""" + + COMMENT_CREATION_VIEW = CommentCreationView.IN_SCHEMA_DEF_NO_COMMANDS + """View comments are added in CREATE VIEW statement""" + + SUPPORTS_MATERIALIZED_VIEWS = True + """StarRocks supports materialized views with refresh strategies""" + + SUPPORTS_MATERIALIZED_VIEW_SCHEMA = True + """ + StarRocks materialized views support specifying a column list, but the column definition is + limited (e.g. column name + comment, not full type definitions). We set this to True and + implement custom MV schema rendering in create_view/_create_materialized_view. + """ + + SUPPORTS_REPLACE_TABLE = False + """No REPLACE TABLE syntax; use DROP + CREATE instead""" + + SUPPORTS_CREATE_DROP_CATALOG = False + """StarRocks supports DROPing external catalogs. + TODO: whether it's external catalogs, or includes the internal catalog + """ + + SUPPORTS_INDEXES = True + """ + StarRocks supports PRIMARY KEY in CREATE TABLE, but NOT standalone CREATE INDEX. + + We set this to True to enable PRIMARY KEY generation in CREATE TABLE statements. + The create_index() method is overridden to prevent actual CREATE INDEX execution. + + Supported (defined in CREATE TABLE): + - PRIMARY KEY: Automatically creates sorted index + - INDEX clause: For bloom filter, bitmap, inverted indexes + NOT supported: + CREATE INDEX idx_name ON t (name); -- Will be skipped by create_index() + """ + + SUPPORTS_TUPLE_IN = False + """ + StarRocks does NOT support tuple IN syntax: (col1, col2) IN ((val1, val2), (val3, val4)) + + Instead, use OR with AND conditions: + (col1 = val1 AND col2 = val2) OR (col1 = val3 AND col2 = val4) + + This is automatically handled by snapshot_id_filter and snapshot_name_version_filter + in sqlmesh/core/state_sync/db/utils.py when SUPPORTS_TUPLE_IN = False. + """ + + MAX_TABLE_COMMENT_LENGTH = 2048 + """Maximum length for table comments""" + + MAX_COLUMN_COMMENT_LENGTH = 255 + """Maximum length for column comments""" + + MAX_IDENTIFIER_LENGTH = 64 + """Maximum length for table/column names""" + + # ==================== Schema Operations ==================== + # StarRocks supports CREATE/DROP SCHEMA the same as CREATE/DROP DATABSE. + # So, no need to implement create_schema / drop_schema + + # ==================== Data Object Query ==================== + def _get_data_objects( + self, schema_name: SchemaName, object_names: t.Optional[t.Set[str]] = None + ) -> t.List[DataObject]: + """ + Returns all the data objects that exist in the given schema. + Uses information_schema tables which are compatible with MySQL protocol. + + StarRocks uses the MySQL-compatible information_schema layout, so the same query + works here. + Note: Materialized View is not reliably distinguished from View (both may appear as `VIEW`) + in information_schema.tables. We therefore best-effort detect MVs via + information_schema.materialized_views and upgrade matching objects to `materialized_view`. 
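+
+        The lookup against information_schema.tables renders roughly as (illustrative):
+            SELECT
+                table_schema AS schema_name,
+                table_name AS name,
+                CASE table_type WHEN 'BASE TABLE' THEN 'table' WHEN 'VIEW' THEN 'view' ELSE table_type END AS type
+            FROM information_schema.tables
+            WHERE table_schema = '<schema>'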
+ + Args: + schema_name: The schema (database) to query + object_names: Optional set of specific table names to filter + + Returns: + List of DataObject instances representing tables and views + """ + schema_db = to_schema(schema_name).db + query = ( + exp.select( + exp.column("table_schema").as_("schema_name"), + exp.column("table_name").as_("name"), + exp.case(exp.column("table_type")) + .when( + exp.Literal.string("BASE TABLE"), + exp.Literal.string("table"), + ) + .when( + exp.Literal.string("VIEW"), + exp.Literal.string("view"), + ) + .else_("table_type") + .as_("type"), + ) + .from_(exp.table_("tables", db="information_schema")) + .where(exp.column("table_schema").eq(schema_db)) + ) + if object_names: + # StarRocks may treat information_schema table_name comparisons as case-sensitive. + # Use LOWER(table_name) to match case-insensitively. + lowered_names = [name.lower() for name in object_names] + query = query.where(exp.func("LOWER", exp.column("table_name")).isin(*lowered_names)) + + df = self.fetchdf(query) + objects = [ + DataObject( + schema=row.schema_name, + name=row.name, + type=DataObjectType.from_str(str(row.type)), + ) + for row in df.itertuples() + ] + + # Best-effort upgrade of MV types using information_schema.materialized_views. + # If this fails (unsupported / permissions / version), fall back to information_schema.tables. + try: + mv_query = ( + exp.select( + exp.column("table_schema").as_("schema_name"), + exp.column("table_name").as_("name"), + ) + .from_(exp.table_("materialized_views", db="information_schema")) + .where(exp.column("table_schema").eq(schema_db)) + ) + if object_names: + lowered_names = [name.lower() for name in object_names] + mv_query = mv_query.where( + exp.func("LOWER", exp.column("table_name")).isin(*lowered_names) + ) + + mv_df = self.fetchdf(mv_query) + mv_names: t.Set[str] = { + t.cast(str, r.name).lower() for r in mv_df.itertuples() if r.name + } + + if mv_names: + for obj in objects: + if obj.name.lower() in mv_names: + obj.type = DataObjectType.MATERIALIZED_VIEW + except Exception: + logger.warning( + f"[StarRocks] Failed to get materialized views from information_schema.materialized_views" + ) + + return objects + + def create_index( + self, + table_name: TableName, + index_name: str, + columns: t.Tuple[str, ...], + exists: bool = True, + ) -> None: + """ + Override to prevent CREATE INDEX statements (not supported in StarRocks). + + StarRocks does not support standalone CREATE INDEX statements. + Indexes must be defined during CREATE TABLE using INDEX clause. + + Since SQLMesh state tables use PRIMARY KEY (which provides efficient indexing), + we simply log and skip additional index creation requests. + + This matches upstream StarRocks limitations and prevents accidental CREATE INDEX calls. + """ + logger.warning( + f"[StarRocks] Skipping CREATE INDEX {index_name} on {table_name} - " + f"StarRocks does not support standalone CREATE INDEX statements. " + f"PRIMARY KEY provides equivalent indexing for columns: {columns}" + ) + return + + def _create_table_like( + self, + target_table_name: TableName, + source_table_name: TableName, + exists: bool, + **kwargs: t.Any, + ) -> None: + """Create a new table using StarRocks' native `CREATE TABLE ... LIKE ...` syntax. + + The base implementation re-creates the target table from `columns(source)` which can + lose non-column metadata. Using LIKE lets the engine preserve more of the original + table definition (engine-defined behavior). 
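+
+        Emitted statement shape (illustrative; table names are placeholders):
+            CREATE TABLE IF NOT EXISTS target_db.target_tbl LIKE source_db.source_tbl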
+ """ + self.execute( + exp.Create( + this=exp.to_table(target_table_name), + kind="TABLE", + exists=exists, + properties=exp.Properties( + expressions=[ + exp.LikeProperty( + this=exp.to_table(source_table_name), + ), + ], + ), + ) + ) + + def delete_from( + self, + table_name: TableName, + where: t.Optional[t.Union[str, exp.Expression]] = None, + ) -> None: + """ + Delete from a table. + + StarRocks DELETE limitations by table type: + + PRIMARY KEY tables: + - Support complex WHERE conditions (subqueries, BETWEEN, etc.) + - No special handling needed + + Other table types (DUPLICATE/UNIQUE/AGGREGATE KEY): + - WHERE TRUE not supported → use TRUNCATE TABLE + - Boolean literals (TRUE/FALSE) not supported + - BETWEEN not supported → convert to >= AND <= + - Others not supported: + - CAST() not supported in WHERE + - Subqueries not supported + - ... + + But, I don't know what the table type is. + + Args: + table_name: The table to delete from + where: The where clause to filter rows to delete + """ + # Parse where clause if it's a string + where_expr: t.Optional[exp.Expression] + if isinstance(where, str): + from sqlglot import parse_one + + where_expr = parse_one(where, dialect=self.dialect) + else: + where_expr = where + + # If no where clause or WHERE TRUE, use TRUNCATE TABLE (for all table types) + if not where_expr or where_expr == exp.true(): + table_expr = exp.to_table(table_name) if isinstance(table_name, str) else table_name + logger.info( + f"Converting DELETE FROM {table_name} WHERE TRUE to TRUNCATE TABLE " + "(StarRocks does not support WHERE TRUE in DELETE)" + ) + self.execute(f"TRUNCATE TABLE {table_expr.sql(dialect=self.dialect, identify=True)}") + return + + # For non-PRIMARY KEY tables, apply WHERE clause restrictions + # Note: We conservatively apply restrictions to all tables since we can't easily + # determine table type at DELETE time. PRIMARY KEY tables will still work with + # simplified conditions, while non-PRIMARY KEY tables require them. + if isinstance(where_expr, exp.Expression): + original_where = where_expr + # Remove boolean literals (not supported in any table type) + where_expr = self._where_clause_remove_boolean_literals(where_expr) + # Convert BETWEEN to >= AND <= (required for DUPLICATE/UNIQUE/AGGREGATE KEY tables) + where_expr = self._where_clause_convert_between_to_comparison(where_expr) + + if where_expr != original_where: + logger.debug( + f"Converted WHERE clause for StarRocks compatibility, table: {table_name}.\n" + f" Original: {original_where.sql(dialect=self.dialect)}\n" + f" Converted: {where_expr.sql(dialect=self.dialect)}" + ) + + # Use parent implementation + super().delete_from(table_name, where_expr) + + def _where_clause_remove_boolean_literals(self, expression: exp.Expression) -> exp.Expression: + """ + Remove TRUE/FALSE boolean literals from WHERE expressions. + + StarRocks Limitation (except PRIMARY KEY tables): + Boolean literals (TRUE/FALSE) are not supported in WHERE clauses. 
+ + This method simplifies expressions: + - (condition) AND TRUE / TRUE AND (condition) → condition + - (condition) OR FALSE / FALSE OR (condition) → condition + - WHERE TRUE → 1=1 (though TRUNCATE is used instead) + - WHERE FALSE → 1=0 + + Args: + expression: The expression to clean + + Returns: + Cleaned expression without boolean literals + """ + + def transform(node: exp.Expression) -> exp.Expression: + # Handle standalone TRUE/FALSE at the top level + if node == exp.true(): + # Convert TRUE to 1=1 + return exp.EQ(this=exp.Literal.number(1), expression=exp.Literal.number(1)) + elif node == exp.false(): # noqa: RET505 + # Convert FALSE to 1=0 + return exp.EQ(this=exp.Literal.number(1), expression=exp.Literal.number(0)) + + # Handle AND expressions + elif isinstance(node, exp.And): + left = node.this + right = node.expression + + # Remove TRUE from AND + if left == exp.true(): + return right + if right == exp.true(): + return left + + # Handle OR expressions + elif isinstance(node, exp.Or): + left = node.this + right = node.expression + + # Remove FALSE from OR + if left == exp.false(): + return right + if right == exp.false(): + return left + + return node + + # Transform the expression tree + return expression.transform(transform, copy=True) + + def _where_clause_convert_between_to_comparison( + self, expression: exp.Expression + ) -> exp.Expression: + """ + Convert BETWEEN expressions to >= AND <= comparisons. + + StarRocks Limitation (DUPLICATE/UNIQUE/AGGREGATE KEY Tables): + BETWEEN is not supported in DELETE WHERE clauses for non-PRIMARY KEY tables. + + PRIMARY KEY tables support BETWEEN, but this conversion is safe for all table types + since the converted form (>= AND <=) is semantically equivalent. + + This method converts: + - col BETWEEN a AND b → col >= a AND col <= b + + Args: + expression: The expression potentially containing BETWEEN + + Returns: + Expression with BETWEEN converted to comparisons + """ + + def transform(node: exp.Expression) -> exp.Expression: + if isinstance(node, exp.Between): + # Extract components: col BETWEEN low AND high + column = node.this # The column being tested + low = node.args.get("low") # Lower bound + high = node.args.get("high") # Upper bound + + if column and low and high: + # Build: column >= low AND column <= high + gte = exp.GTE(this=column.copy(), expression=low.copy()) + lte = exp.LTE(this=column.copy(), expression=high.copy()) + return exp.And(this=gte, expression=lte) + + return node + + # Transform the expression tree + return expression.transform(transform, copy=True) + + def execute( + self, + expressions: t.Union[str, exp.Expression, t.Sequence[exp.Expression]], + ignore_unsupported_errors: bool = False, + quote_identifiers: bool = True, + track_rows_processed: bool = False, + **kwargs: t.Any, + ) -> None: + """ + Override execute to strip FOR UPDATE from queries (not supported in StarRocks). + + StarRocks is an OLAP database and does not support row-level locking via + SELECT ... FOR UPDATE. This method removes lock expressions before execution. 
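+
+        For example (illustrative; the table name is a placeholder), a query such as
+            SELECT name, identifier FROM sqlmesh_snapshots WHERE name = 'x' FOR UPDATE
+        is sent to StarRocks as
+            SELECT name, identifier FROM sqlmesh_snapshots WHERE name = 'x'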
+ + Args: + expressions: SQL expression(s) to execute + ignore_unsupported_errors: Whether to ignore unsupported errors + quote_identifiers: Whether to quote identifiers + track_rows_processed: Whether to track rows processed + **kwargs: Additional arguments + """ + from sqlglot.helper import ensure_list + + if isinstance(expressions, str): + super().execute( + expressions, + ignore_unsupported_errors=ignore_unsupported_errors, + quote_identifiers=quote_identifiers, + track_rows_processed=track_rows_processed, + **kwargs, + ) + return + + # Process expressions to remove FOR UPDATE + processed_expressions: t.List[exp.Expression] = [] + for e in ensure_list(expressions): + if not isinstance(e, exp.Expression): + super().execute( + expressions, + ignore_unsupported_errors=ignore_unsupported_errors, + quote_identifiers=quote_identifiers, + track_rows_processed=track_rows_processed, + **kwargs, + ) + return + + # Remove lock (FOR UPDATE) from SELECT statements + if isinstance(e, exp.Select) and e.args.get("locks"): + e = e.copy() + e.set("locks", None) + logger.warning( + f"[StarRocks] Removed FOR UPDATE from SELECT statement: " + f"{e.sql(dialect=self.dialect, identify=quote_identifiers)}" + ) + processed_expressions.append(e) + + # Call parent execute with processed expressions + super().execute( + processed_expressions, + ignore_unsupported_errors=ignore_unsupported_errors, + quote_identifiers=quote_identifiers, + track_rows_processed=track_rows_processed, + **kwargs, + ) + + # ==================== Table Creation (CORE IMPLEMENTATION) ==================== + def _create_table_from_columns( + self, + table_name: TableName, + target_columns_to_types: t.Dict[str, exp.DataType], + primary_key: t.Optional[t.Tuple[str, ...]] = None, + exists: bool = True, + table_description: t.Optional[str] = None, + column_descriptions: t.Optional[t.Dict[str, str]] = None, + **kwargs: t.Any, + ) -> None: + """ + Create a table using column definitions. + + Unified Model Parameter vs Physical Properties Handling: + For properties that can be defined both as model parameters and in physical_properties, + this method implements a unified priority strategy: + 1. Model parameter takes priority if present + 2. Otherwise, use value from physical_properties + 3. Ensure at most one definition exists + + Supported unified properties: + - primary_key: Model parameter OR physical_properties.primary_key + - partitioned_by: Model parameter OR physical_properties.partitioned_by/partition_by + - clustered_by: Model parameter OR physical_properties.clustered_by/order_by + + Other key types (duplicate_key, aggregate_key, unique_key) only support physical_properties. + + StarRocks Key Column Ordering Constraint: + ALL key types (PRIMARY KEY, UNIQUE KEY, DUPLICATE KEY, AGGREGATE KEY) require: + - Key columns MUST be the first N columns in CREATE TABLE + - Column order MUST match the KEY clause order + + Implementation Strategy: + 1. Normalize model parameters into table_properties with priority handling + 2. Extract and validate key columns from unified table_properties + 3. Validate no conflicts between different key types + 4. Reorder columns to place key columns first + 5. For PRIMARY KEY: Pass to base class (sets SUPPORTS_INDEXES=True) + 6. 
For other keys: Handle in _build_table_key_property + + Args: + table_name: Fully qualified table name + target_columns_to_types: Column definitions {name: DataType} + primary_key: Primary key column names (model parameter, takes priority) + exists: Add IF NOT EXISTS clause + table_description: Table comment + column_descriptions: Column comments {column_name: comment} + kwargs: Additional properties including: + - partitioned_by: Partition columns (model parameter) + - clustered_by: Clustering columns (model parameter) + - table_properties: Physical properties dict + + Example: + # Model parameter (priority): + partitioned_by=dt, + clustered_by=(dt, id)) + physical_properties( + primary_key=(id, dt) + ) + + # Or physical_properties only: + physical_properties( + duplicate_key=(id, dt), + partitioned_by=dt, + order_by=(dt, id) + ) + """ + # Use setdefault to simplify table_properties access + table_properties = kwargs.setdefault("table_properties", {}) + + # Extract and validate key columns from table_properties + # Priority: parameter primary_key > table_properties (already handled above) + key_type, key_columns = self._extract_and_validate_key_columns( + table_properties, primary_key + ) + # logger.debug( + # "_create_table_from_columns: extracted key_type=%s, key_columns=%s", + # key_type, + # key_columns, + # ) + + # IMPORTANT: Normalize parameter primary_key into table_properties for unified handling + # This ensures _build_table_properties_exp() can access primary_key even when + # it's passed as a model parameter rather than in physical_properties + if primary_key: + table_properties["primary_key"] = primary_key + logger.debug("_create_table_from_columns: unified primary_key into table_properties") + elif key_type: + # logger.debug( + # "table key type '%s' may be handled in _build_table_key_property", key_type + # ) + pass + + # StarRocks key column ordering constraint: All key types need reordering + if key_columns: + target_columns_to_types = self._reorder_columns_for_key( + target_columns_to_types, key_columns, key_type or "key" + ) + + # IMPORTANT: Do NOT pass primary_key to base class! + # Unlike other databases, StarRocks requires PRIMARY KEY to be in POST_SCHEMA location + # (in properties section after columns), not inside schema (inside column definitions). + # We handle ALL key types (including PRIMARY KEY) in _build_table_key_property. 
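+        # Hedged sketch of the resulting DDL shape (identifiers and types are examples only):
+        #   CREATE TABLE db.tbl (id BIGINT, dt DATE, v INT)
+        #   PRIMARY KEY (id, dt)
+        #   DISTRIBUTED BY HASH (id) BUCKETS 16
+        # i.e. the key clause is emitted after the column list, not inside it.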
+ # logger.debug( + # "_create_table_from_columns: NOT passing primary_key to base class (handled in _build_table_key_property)" + # ) + super()._create_table_from_columns( + table_name=table_name, + target_columns_to_types=target_columns_to_types, + primary_key=None, # StarRocks handles PRIMARY KEY in properties, not schema + exists=exists, + table_description=table_description, + column_descriptions=column_descriptions, + **kwargs, + ) + + # ==================== View / Materialized View ==================== + def create_view( + self, + view_name: TableName, + query_or_df: QueryOrDF, + target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, + replace: bool = True, + materialized: bool = False, + materialized_properties: t.Optional[t.Dict[str, t.Any]] = None, + table_description: t.Optional[str] = None, + column_descriptions: t.Optional[t.Dict[str, str]] = None, + view_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + source_columns: t.Optional[t.List[str]] = None, + **create_kwargs: t.Any, + ) -> None: + """ + StarRocks behavior: + - Regular VIEW: supports CREATE OR REPLACE (base behavior) + - MATERIALIZED VIEW: does NOT support CREATE OR REPLACE, so replace=True => DROP + CREATE + """ + if not materialized: + return super().create_view( + view_name=view_name, + query_or_df=query_or_df, + target_columns_to_types=target_columns_to_types, + replace=replace, + materialized=False, + materialized_properties=materialized_properties, + table_description=table_description, + column_descriptions=column_descriptions, + view_properties=view_properties, + source_columns=source_columns, + **create_kwargs, + ) + + # MATERIALIZED VIEW path + if replace: + # Avoid DROP MATERIALIZED VIEW failure when an object with the same name exists but is not an MV. + self.drop_data_object_on_type_mismatch( + self.get_data_object(view_name), DataObjectType.MATERIALIZED_VIEW + ) + self.drop_view(view_name, ignore_if_not_exists=True, materialized=True) + # logger.debug( + # f"Creating materialized view: {view_name}, materialized: {materialized}, " + # f"materialized_properties: {materialized_properties}, " + # f"view_properties: {view_properties}, create_kwargs: {create_kwargs}, " + # ) + + return self._create_materialized_view( + view_name=view_name, + query_or_df=query_or_df, + target_columns_to_types=target_columns_to_types, + materialized_properties=materialized_properties, + table_description=table_description, + column_descriptions=column_descriptions, + view_properties=view_properties, + source_columns=source_columns, + **create_kwargs, + ) + + def _create_materialized_view( + self, + view_name: TableName, + query_or_df: QueryOrDF, + target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, + materialized_properties: t.Optional[t.Dict[str, t.Any]] = None, + table_description: t.Optional[str] = None, + column_descriptions: t.Optional[t.Dict[str, str]] = None, + view_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + source_columns: t.Optional[t.List[str]] = None, + **create_kwargs: t.Any, + ) -> None: + """ + Create a StarRocks materialized view. + + StarRocks MV schema supports a column list but does NOT support explicit data types in that list. + We therefore build a schema with column names + optional COMMENT only. 
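+
+        Resulting statement shape (illustrative; names and the refresh clause depend on the model):
+            CREATE MATERIALIZED VIEW db.mv (user_id COMMENT 'user id', cnt)
+            REFRESH ASYNC
+            AS SELECT user_id, COUNT(*) AS cnt FROM db.events GROUP BY user_id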
+ """ + import pandas as pd + + query_or_df = self._native_df_to_pandas_df(query_or_df) + + if isinstance(query_or_df, pd.DataFrame): + values: t.List[t.Tuple[t.Any, ...]] = list( + query_or_df.itertuples(index=False, name=None) + ) + target_columns_to_types, source_columns = self._columns_to_types( + query_or_df, target_columns_to_types, source_columns + ) + if not target_columns_to_types: + raise SQLMeshError("columns_to_types must be provided for dataframes") + source_columns_to_types = get_source_columns_to_types( + target_columns_to_types, source_columns + ) + query_or_df = self._values_to_sql( + values, + source_columns_to_types, + batch_start=0, + batch_end=len(values), + ) + + source_queries, target_columns_to_types = self._get_source_queries_and_columns_to_types( + query_or_df, + target_columns_to_types, + batch_size=0, + target_table=view_name, + source_columns=source_columns, + ) + if len(source_queries) != 1: + raise SQLMeshError("Only one source query is supported for creating materialized views") + + target_table = exp.to_table(view_name) + schema: t.Union[exp.Table, exp.Schema] = self._build_materialized_view_schema_exp( + target_table, + target_columns_to_types=target_columns_to_types, + column_descriptions=column_descriptions, + ) + + # Pass model materialized properties through the existing properties builder + partitioned_by = None + clustered_by = None + partition_interval_unit = None + if materialized_properties: + partitioned_by = materialized_properties.get("partitioned_by") + clustered_by = materialized_properties.get("clustered_by") + partition_interval_unit = materialized_properties.get("partition_interval_unit") + # logger.debug( + # f"Get info from materialized_properties: {materialized_properties}, " + # f"partitioned_by: {partitioned_by}, " + # f"clustered_by: {clustered_by}, " + # f"partition_interval_unit: {partition_interval_unit}" + # ) + + properties_exp = self._build_table_properties_exp( + catalog_name=target_table.catalog, + table_properties=view_properties, + target_columns_to_types=target_columns_to_types, + table_description=table_description, + partitioned_by=partitioned_by, + clustered_by=clustered_by, + partition_interval_unit=partition_interval_unit, + table_kind="MATERIALIZED_VIEW", + ) + + with source_queries[0] as query: + self.execute( + exp.Create( + this=schema, + kind="VIEW", + replace=False, + expression=query, + properties=properties_exp, + **create_kwargs, + ), + quote_identifiers=self.QUOTE_IDENTIFIERS_IN_VIEWS, + ) + + self._clear_data_object_cache(view_name) + + def _build_materialized_view_schema_exp( + self, + table: exp.Table, + *, + target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, + column_descriptions: t.Optional[t.Dict[str, str]] = None, + ) -> t.Union[exp.Table, exp.Schema]: + """ + Build a StarRocks MV schema with column names + optional COMMENT only (no types). 
+ """ + columns: t.List[str] = [] + if target_columns_to_types: + columns = list(target_columns_to_types) + elif column_descriptions: + columns = list(column_descriptions) + + if not columns: + return table + + column_descriptions = column_descriptions or {} + expressions: t.List[exp.Expression] = [] + for col in columns: + constraints: t.List[exp.ColumnConstraint] = [] + comment = column_descriptions.get(col) + if comment: + constraints.append( + exp.ColumnConstraint( + kind=exp.CommentColumnConstraint( + this=exp.Literal.string(self._truncate_column_comment(comment)) + ) + ) + ) + expressions.append( + exp.ColumnDef( + this=exp.to_identifier(col), + constraints=constraints, + ) + ) + + return exp.Schema(this=table, expressions=expressions) + + # ==================== Table Properties Builder (for Table and MV/VIew) ==================== + def _build_table_properties_exp( + self, + catalog_name: t.Optional[str] = None, + table_format: t.Optional[str] = None, + storage_format: t.Optional[str] = None, + partitioned_by: t.Optional[t.List[exp.Expression]] = None, + partition_interval_unit: t.Optional[IntervalUnit] = None, + clustered_by: t.Optional[t.List[exp.Expression]] = None, + table_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, + table_description: t.Optional[str] = None, + table_kind: t.Optional[str] = None, + **kwargs: t.Any, + ) -> t.Optional[exp.Properties]: + """ + Build table properties for StarRocks CREATE TABLE statement. + + Unified Model Parameter vs Physical Properties Handling: + This method receives both model parameters (partitioned_by, clustered_by) and + physical_properties (table_properties dict). Priority is handled as follows: + + 1. primary_key / partitioned_by / clustered_by (ORDER BY) + - Model parameter takes priority + - Falls back to physical_properties.xxx + - Handled in _build_partition_property + + 2. special for primary_key: + - Still need to be processed in _build_table_key_property + + 3. Other key types (duplicate_key, unique_key, aggregate_key): + - Only available via physical_properties + - Handled in _build_table_key_property + + Handles: + - Key constraints (PRIMARY KEY, DUPLICATE KEY, UNIQUE KEY) + - Partition expressions (RANGE/LIST/EXPRESSION) + - Distribution (HASH/RANDOM) + - Order by (clustering) + - Table comment + - Other properties (replication_num, storage_medium, etc.) 
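+
+        Illustrative shape of the emitted clauses, in build order (hedged; exact rendering is up to SQLGlot):
+            PRIMARY KEY (id, dt)
+            COMMENT 'orders table'
+            PARTITION BY RANGE (dt) (PARTITION p1 VALUES LESS THAN ("2024-01-01"))
+            DISTRIBUTED BY HASH (id) BUCKETS 10
+            ORDER BY (dt, id)
+            PROPERTIES ("replication_num" = "1")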
+ + Args: + partitioned_by: Partition columns/expression from model parameter (takes priority) + clustered_by: Clustering columns from model parameter (takes priority) + table_properties: Dictionary containing physical_properties: + - primary_key/duplicate_key/unique_key/aggregate_key: Tuple/list of column names + - partitioned_by(partition_by): Partition definition (fallback) + - distributed_by: Tuple of EQ expressions (kind, expressions, buckets) or string + - clustered_by(order_by): Clustering definition (fallback) + - replication_num, storage_medium, etc.: Literal values + table_description: Table comment + """ + properties: t.List[exp.Expression] = [] + table_properties_copy = dict(table_properties) if table_properties else {} + # logger.debug( + # "_build_table_properties_exp: table_properties=%s", + # table_properties.keys() if table_properties else [], + # ) + + is_mv = table_kind == "MATERIALIZED_VIEW" + if is_mv: + # Required for CREATE MATERIALIZED VIEW (SQLGlot uses this property to switch the keyword) + properties.append(exp.MaterializedProperty()) + + # Validate all property names at once + PropertyValidator.check_all_invalid_names(table_properties_copy) + + # Check for mutually exclusive key types + # Note: primary_key is already set into table_properties if model param is set + active_key_type = PropertyValidator.check_at_most_one( + property_name="key_type", + property_description="key type", + table_properties=table_properties_copy, + ) + if is_mv and active_key_type: + raise SQLMeshError( + f"You can't specify the table type when the table is a materialized view. " + f"Current specified key type '{active_key_type}'." + ) + + # 0. Extract key columns for partition/distribution validation (read-only, don't pop yet) + key_type, key_columns = None, None + if active_key_type: + key_type = active_key_type + key_expr = table_properties_copy[key_type] + # Use validate_and_normalize_property to get List[exp.Column], then extract names + normalized = PropertyValidator.validate_and_normalize_property( + key_type, key_expr, preprocess_parentheses=True + ) + key_columns = tuple(col.name for col in normalized) + + # 1. Handle key constraints (ALL types including PRIMARY KEY) + key_prop = self._build_table_key_property(table_properties_copy, active_key_type) + if key_prop: + properties.append(key_prop) + + # 2. Add table comment (it must be ahead of other properties except the talbe key/type) + if table_description: + properties.append( + exp.SchemaCommentProperty( + this=exp.Literal.string(self._truncate_table_comment(table_description)) + ) + ) + + # 3. Handle partitioned_by (PARTITION BY RANGE/LIST/EXPRESSION) + partition_prop = self._build_partition_property( + partitioned_by, + partition_interval_unit, + target_columns_to_types, + catalog_name, + table_properties_copy, + key_type, + key_columns, + ) + if partition_prop: + properties.append(partition_prop) + + # 4. Handle distributed_by (DISTRIBUTED BY HASH/RANDOM) + distributed_prop = self._build_distributed_by_property(table_properties_copy, key_columns) + if distributed_prop: + properties.append(distributed_prop) + + # 5. Handle refresh_property (REFRESH ...) + if is_mv: + refresh_prop = self._build_refresh_property(table_properties_copy) + if refresh_prop: + properties.append(refresh_prop) + + # 6. Handle order_by/clustered_by (ORDER BY ...) + order_prop = self._build_order_by_property(table_properties_copy, clustered_by or None) + if order_prop: + properties.append(order_prop) + + # 5. 
Handle other properties (replication_num, storage_medium, etc.) + other_props = self._build_other_properties(table_properties_copy) + properties.extend(other_props) + + return exp.Properties(expressions=properties) if properties else None + + def _build_view_properties_exp( + self, + view_properties: t.Optional[t.Dict[str, exp.Expression]] = None, + table_description: t.Optional[str] = None, + **kwargs: t.Any, + ) -> t.Optional[exp.Properties]: + """ + Build CREATE VIEW properties for StarRocks. + + Supports StarRocks view SECURITY syntax: SECURITY {NONE | INVOKER} + via exp.SecurityProperty (renders as `SECURITY `). + """ + properties: t.List[exp.Expression] = [] + + if table_description: + properties.append( + exp.SchemaCommentProperty( + this=exp.Literal.string(self._truncate_table_comment(table_description)) + ) + ) + + if view_properties: + view_properties_copy = dict(view_properties) + security = view_properties_copy.pop("security", None) + if security is not None: + security_text = PropertyValidator.validate_and_normalize_property( + "security", security + ) + # exp.SecurityProperty renders as `SECURITY ` (no '=') + properties.append(exp.SecurityProperty(this=exp.Var(this=security_text))) + + properties.extend(self._table_or_view_properties_to_expressions(view_properties_copy)) + + if properties: + return exp.Properties(expressions=properties) + return None + + def _build_table_key_property( + self, table_properties: t.Dict[str, t.Any], active_key_type: t.Optional[str] + ) -> t.Optional[exp.Expression]: + """ + Build key constraint property for ALL key types including PRIMARY KEY. + + Unlike other databases where PRIMARY KEY is handled by base class in schema, + StarRocks requires ALL key types (PRIMARY KEY, DUPLICATE KEY, UNIQUE KEY, AGGREGATE KEY) + to be in POST_SCHEMA location (properties section after columns). + + Handles: + - PRIMARY KEY + - DUPLICATE KEY + - UNIQUE KEY + - AGGREGATE KEY (when implemented) + + Args: + table_properties: Dictionary containing key definitions (will be modified) + active_key_type: The active key type or None + + Returns: + Key property expression for the active key type, or None + """ + if not active_key_type: + return None + + # Configuration: key_name -> Property class (excluding primary_key) + KEY_PROPERTY_CLASSES: t.Dict[str, t.Type[exp.Expression]] = { + "primary_key": exp.PrimaryKey, + "duplicate_key": exp.DuplicateKeyProperty, + "unique_key": exp.UniqueKeyProperty, + # "aggregate_key": exp.AggregateKeyProperty, # Not implemented yet + } + + property_class = KEY_PROPERTY_CLASSES.get(active_key_type) + key_value = table_properties.pop(active_key_type, None) + if not property_class: + # Aggregate key requires special handling + if active_key_type == "aggregate_key": + raise SQLMeshError( + "AGGREGATE KEY tables are not currently supported. " + "AGGREGATE KEY requires specifying aggregation functions (SUM/MAX/MIN/REPLACE) " + "for value columns, which is not supported in the current model configuration syntax. " + "Please use PRIMARY KEY, UNIQUE KEY, or DUPLICATE KEY instead." 
+ ) + # Unknown key type + logger.warning(f"[StarRocks] Unknown key type: {active_key_type}") + return None + if key_value is None: + logger.error(f"Failed to get the parameter value for {active_key_type!r}") + return None + + logger.debug( + "_build_table_key_property: input key=%s value=%s", + active_key_type, + key_value, + ) + + # Validate and normalize + # preprocess_parentheses=True handles string preprocessing like 'id, dt' -> '(id, dt)' + normalized = PropertyValidator.validate_and_normalize_property( + active_key_type, key_value, preprocess_parentheses=True + ) + # normalized is List[exp.Column] as defined in TableKeyInputSpec + result = property_class(expressions=list(normalized)) + return result + + def _build_partition_property( + self, + partitioned_by: t.Optional[t.List[exp.Expression]], + partition_interval_unit: t.Optional["IntervalUnit"], + target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]], + catalog_name: t.Optional[str], + table_properties: t.Dict[str, t.Any], + key_type: t.Optional[str], + key_columns: t.Optional[t.Tuple[str, ...]], + ) -> t.Optional[exp.Expression]: + """ + Build partition property expression. + + StarRocks supports: + - PARTITION BY RANGE (cols) - for time-based partitions + - PARTITION BY LIST (cols) - for categorical partitions + - PARTITION BY (exprs) - for expression partitions, can also be `exprs` (without `(`, and `)`) + + Args: + partitioned_by: Partition column expressions from parameter + partition_interval_unit: Optional time unit for automatic partitioning + target_columns_to_types: Column definitions + catalog_name: Catalog name (if applicable) + table_properties: Dictionary containing partitioned_by/partitions (will be modified) + key_type: Table key type (for validation) + key_columns: Table key columns (partition columns must be subset) + + Returns: + Partition property expression or None + """ + # Priority: parameter > partition_by (alias) > partitioned_by + # Use PropertyValidator to check mutual exclusion between parameter and properties + partition_param_name = PropertyValidator.check_at_most_one( + property_name="partitioned_by", + property_description="partition definition", + table_properties=table_properties, + parameter_value=partitioned_by or None, + ) + + # If parameter was provided, it takes priority + if not partitioned_by and partition_param_name: + # Get from table_properties + partitioned_by = table_properties.pop(partition_param_name, None) + if not partitioned_by: + return None + + # Parse partition expressions to extract columns and kind (RANGE/LIST) + partition_kind, partition_cols = self._parse_partition_expressions(partitioned_by) + logger.debug( + "_build_partition_property: partition_kind=%s, partition_cols=%s", + partition_kind, + partition_cols, + ) + + def extract_column_name(expr: exp.Expression) -> t.Optional[str]: + if isinstance(expr, exp.Column): + return str(expr.name) + elif isinstance(expr, (exp.Anonymous, exp.Func)): # noqa: RET505 + return None # not implemented + else: + return str(expr) + + # Validate partition columns are in key columns (StarRocks requirement) + if key_columns: + partition_col_names = set(extract_column_name(expr) for expr in partition_cols) - {None} + key_cols_set = set(key_columns) + not_in_key = partition_col_names - key_cols_set + if not_in_key: + logger.warning( + f"[StarRocks] Partition columns {not_in_key} not in {key_type} columns {key_cols_set}. " + "StarRocks requires partition columns to be part of the table key." 
+ ) + + # Get partition definitions (RANGE/LIST partitions) + # Note: Expression-based partitioning (partition_kind=None) does not support pre-created partitions + if partitions := table_properties.pop("partitions", None): + if partition_kind is None: + logger.warning( + "[StarRocks] 'partitions' parameter is ignored for expression-based partitioning. " + "Expression partitioning creates partitions automatically and does not support " + "pre-created partition definitions." + ) + partitions = None # Ignore partitions for expression-based partitioning + else: + partitions = PropertyValidator.validate_and_normalize_property( + "partitions", partitions + ) + + # Build partition expression using base class method + result = self._build_partitioned_by_exp( + partition_cols, + partition_interval_unit=partition_interval_unit, + target_columns_to_types=target_columns_to_types, + catalog_name=catalog_name, + partitions=partitions, + partition_kind=partition_kind, + ) + return result + + def _parse_partition_expressions( + self, partitioned_by: t.List[exp.Expression] + ) -> t.Tuple[t.Optional[str], t.List[exp.Expression]]: + """ + Parse partition expressions and extract partition kind (RANGE/LIST). + + Uses PartitionedByInputSpec to validate and normalize the entire list, + then extracts RANGE/LIST kind from function expressions. + + The SPEC output is List[exp.Column | exp.Anonymous | exp.Func], where: + - exp.Column: Regular column reference + - exp.Anonymous: Function call like RANGE(col), LIST(col), and other datetime related functions + - exp.Func: date_trunc(), and other built-in functions + + Args: + partitioned_by: List of partition expressions + + Returns: + Tuple of (partition_kind, normalized_columns) + - partition_kind: "RANGE", "LIST", or None + - normalized_columns: List of Column expressions, or function expressions + """ + parsed_cols: t.List[exp.Expression] = [] + partition_kind: t.Optional[str] = None + + normalized = PropertyValidator.validate_and_normalize_property( + "partitioned_by", partitioned_by, preprocess_parentheses=True + ) + # Process each normalized expression + for norm_expr in normalized: + # Check if it's a RANGE function (exp.Anonymous) + if isinstance(norm_expr, exp.Anonymous) and norm_expr.this: + func_name = str(norm_expr.this).upper() + if func_name in ("RANGE", "LIST"): + partition_kind = func_name + # Extract column expressions from function arguments + for arg in norm_expr.expressions: + if isinstance(arg, exp.Column): + parsed_cols.append(arg) + else: + parsed_cols.append(exp.to_column(str(arg))) + continue + + # Check if it's a LIST expression (SQLGlot parses LIST(...) as exp.List) + if isinstance(norm_expr, exp.List): + partition_kind = "LIST" + # Extract column expressions from list items + for item in norm_expr.expressions: + if isinstance(item, exp.Column): + parsed_cols.append(item) + else: + parsed_cols.append(exp.to_column(str(item))) + continue + + # Regular column or other function (date_trunc, etc.) + parsed_cols.append(norm_expr) + + return partition_kind, parsed_cols + + def _build_partitioned_by_exp( + self, + partitioned_by: t.List[exp.Expression], + *, + partition_interval_unit: t.Optional["IntervalUnit"] = None, + target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None, + catalog_name: t.Optional[str] = None, + **kwargs: t.Any, + ) -> t.Optional[ + t.Union[ + exp.PartitionedByProperty, + exp.PartitionByRangeProperty, + exp.PartitionByListProperty, + exp.Property, + ] + ]: + """ + Build StarRocks partitioning expression. 
+ + - partition_kind: RANGE/LIST/None (passed via kwargs, None as expression partitioning) + - partitioned_by: normalized partition column/func/anonymous expressions + - partitions: partition definitions as List[str] (passed via kwargs) + + Supports both RANGE and LIST partition syntaxes, and expression partition syntax. + + Args: + partitioned_by: List of partition column expressions + partition_interval_unit: Optional time unit (unused for now) + target_columns_to_types: Column definitions (unused for now) + catalog_name: Catalog name (unused for now) + **kwargs: Must contain 'partition_kind' and optionally 'partitions' + + Returns: + PartitionByRangeProperty, PartitionByListProperty, or None + """ + partition_kind = kwargs.get("partition_kind") + partitions: t.Optional[t.List[str]] = kwargs.get("partitions") + + # Process partitions to create_expressions + # partitions is already List[str] after SPEC normalization + create_expressions: t.Optional[t.List[exp.Var]] = None + if partitions: + create_expressions = [exp.Var(this=p, quoted=False) for p in partitions] + + # Build partition expression + if partition_kind == "LIST": + return exp.PartitionByListProperty( + partition_expressions=partitioned_by, + create_expressions=create_expressions, + ) + elif partition_kind == "RANGE": # noqa: RET505 + return exp.PartitionByRangeProperty( + partition_expressions=partitioned_by, + create_expressions=create_expressions, + ) + elif partition_kind is None: + return exp.PartitionedByProperty(this=exp.tuple_(*partitioned_by)) + + return None + + def _build_distributed_by_property( + self, + table_properties: t.Dict[str, t.Any], + key_columns: t.Optional[t.Tuple[str, ...]], + ) -> t.Optional[exp.DistributedByProperty]: + """ + Build DISTRIBUTED BY property from table_properties. + + Supports: + 1. Structured tuple: (kind='HASH', columns=(id, dt), buckets=10) + 2. String format: "HASH(id)", "RANDOM", "HASH(id) BUCKETS 10" + 3. 
None: Returns None (no default distribution) + + For complex string like "HASH(id) BUCKETS 10", uses split-and-combine: + - Split on 'BUCKETS' to separate HASH part and bucket count + - Parse HASH part via DistributedByInputSpec + - Parse bucket count as number + - Combine into unified dict + + Args: + table_properties: Dictionary containing distributed_by (will be modified) + key_columns: Table key columns (used for default distribution) + + Returns: + DistributedByProperty or None + """ + distributed_by = table_properties.pop("distributed_by", None) + + # No default - if not set, return None + if distributed_by is None: + return None + + # Try to parse complex string with BUCKETS first + unified = self._parse_distribution_with_buckets(distributed_by) + if unified is None: + # Fall back to SPEC-based parsing + normalized = PropertyValidator.validate_and_normalize_property( + "distributed_by", distributed_by + ) + # Convert to unified dict format + unified = DistributionTupleOutputType.to_unified_dict(normalized) + + logger.debug( + "_build_distributed_by_property: normalized to kind=%s, columns=%s, buckets=%s", + unified.get("kind"), + unified.get("columns"), + unified.get("buckets"), + ) + + # Build expression + kind_expr = exp.Var(this=unified["kind"]) + # Convert columns to expressions + columns: t.List[exp.Column] = unified.get("columns", []) + expressions_list: t.List[exp.Expression] = [] + for col in columns: + if isinstance(col, exp.Expression): + expressions_list.append(col) + else: + expressions_list.append(exp.to_column(str(col))) + # Build buckets expression + buckets: t.Optional[t.Any] = unified.get("buckets") + if buckets is not None: + if isinstance(buckets, exp.Literal): + buckets_expr = buckets + else: + buckets_expr = exp.Literal.number(int(buckets)) + else: + buckets_expr = None + + result = exp.DistributedByProperty( + kind=kind_expr, + expressions=expressions_list, + buckets=buckets_expr, + order=None, + ) + return result + + def _build_refresh_property( + self, table_properties: t.Dict[str, t.Any] + ) -> t.Optional[exp.RefreshTriggerProperty]: + """ + Build StarRocks MV REFRESH clause as exp.RefreshTriggerProperty. + + Input (from physical_properties): + - refresh_moment: IMMEDIATE | DEFERRED (optional) + - refresh_scheme: MANUAL | ASYNC [START ()] EVERY (INTERVAL ) (optional) + + Output mapping (to match sqlglot StarRocks generator refreshtriggerproperty_sql): + - method: refresh_moment when provided; otherwise a sentinel that won't render + - kind: ASYNC | MANUAL + - starts/every/unit: parsed from refresh_scheme if present + """ + refresh_moment = table_properties.pop("refresh_moment", None) + refresh_scheme = table_properties.pop("refresh_scheme", None) + if refresh_moment is None and refresh_scheme is None: + return None + + # method is required by exp.RefreshTriggerProperty, but StarRocks syntax does NOT support AUTO. + # We use a sentinel value that the StarRocks generator will not render (it only renders + # IMMEDIATE/DEFERRED). 
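+        # Illustrative end-to-end mapping (assumed from the MV integration tests further below,
+        # not an exhaustive spec):
+        #   refresh_moment = DEFERRED
+        #   refresh_scheme = "ASYNC START ('2025-01-01 00:00:00') EVERY (INTERVAL 5 MINUTE)"
+        # is expected to surface in SHOW CREATE MATERIALIZED VIEW as
+        #   REFRESH DEFERRED ASYNC START ('2025-01-01 00:00:00') EVERY (INTERVAL 5 MINUTE)
+        # while refresh_scheme = MANUAL (with no refresh_moment) renders simply as REFRESH MANUAL.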
+ method_expr = exp.Var(this="UNSPECIFIED") + if refresh_moment is not None: + refresh_moment_text = PropertyValidator.validate_and_normalize_property( + "refresh_moment", refresh_moment + ) + method_expr = exp.Var(this=refresh_moment_text) + + kind_expr: t.Optional[exp.Expression] = None + starts_expr: t.Optional[exp.Expression] = None + every_expr: t.Optional[exp.Expression] = None + unit_expr: t.Optional[exp.Expression] = None + + if refresh_scheme is not None: + scheme_text = PropertyValidator.validate_and_normalize_property( + "refresh_scheme", refresh_scheme + ) + if isinstance(scheme_text, exp.Var): + kind_expr = scheme_text + else: + kind_expr, starts_expr, every_expr, unit_expr = self._parse_refresh_scheme( + scheme_text + ) + + return exp.RefreshTriggerProperty( + method=method_expr, + kind=kind_expr, + starts=starts_expr, + every=every_expr, + unit=unit_expr, + ) + + def _parse_refresh_scheme( + self, refresh_scheme: str + ) -> t.Tuple[ + t.Optional[exp.Expression], + t.Optional[exp.Expression], + t.Optional[exp.Expression], + t.Optional[exp.Expression], + ]: + """ + Parse StarRocks refresh_scheme text into (kind, starts, every, unit). + + parsing simple and robust. We only extract: + - kind: ASYNC | MANUAL (must appear at the beginning), None if not provided + - starts: START () where is treated as a raw string + - every/unit: EVERY (INTERVAL ) + """ + text = (refresh_scheme or "").strip() + if not text: + return None, None, None, None + + m_kind = re.match(r"^(MANUAL|ASYNC)\b", text, flags=re.IGNORECASE) + if not m_kind: + raise SQLMeshError( + f"[StarRocks] Invalid refresh_scheme {refresh_scheme!r}. Expected to start with MANUAL or ASYNC." + ) + kind = m_kind.group(1).upper() + kind_expr: t.Optional[exp.Expression] = exp.Var(this=kind) + + starts_expr: t.Optional[exp.Expression] = None + every_expr: t.Optional[exp.Expression] = None + unit_expr: t.Optional[exp.Expression] = None + m_start = re.search( + r"\bSTART\s*\(\s*(?:'([^']*)'|\"([^\"]*)\"|([^)]*))\s*\)", text, flags=re.IGNORECASE + ) + if m_start: + start_inner = (m_start.group(1) or m_start.group(2) or m_start.group(3) or "").strip() + starts_expr = exp.Literal.string(start_inner) + m_every = re.search( + r"\bEVERY\s*\(\s*INTERVAL\s+(\d+)\s+(\w+)\s*\)", text, flags=re.IGNORECASE + ) + if m_every: + every_expr = exp.Literal.number(int(m_every.group(1))) + unit_expr = exp.Var(this=m_every.group(2).upper()) + return kind_expr, starts_expr, every_expr, unit_expr + + def _parse_distribution_with_buckets( + self, distributed_by: t.Any + ) -> t.Optional[t.Dict[str, t.Any]]: + """ + Parse complex distribution expressions like 'HASH(id) BUCKETS 10'. + + Since SQLGlot cannot parse 'HASH(id) BUCKETS 10' directly, we: + 1. Detect if input is a string containing 'BUCKETS' + 2. Split into HASH part and BUCKETS part + 3. Parse HASH part via DistributedByInputSpec + 4. Extract bucket count as number + 5. Combine into unified dict + + Args: + distributed_by: The distribution value (may be string, expression, etc.) 
+ + Returns: + Unified dict with keys: kind, columns, buckets + Returns None if not a complex BUCKETS expression + (The output function will still handle "HASH(id)" without BUCKETS) + """ + # Only handle string or Literal string values + if isinstance(distributed_by, str): + text = distributed_by + elif isinstance(distributed_by, exp.Literal) and distributed_by.is_string: + text = str(distributed_by.this) + else: + return None + + # Check if contains BUCKETS keyword (case-insensitive) + if "BUCKETS" not in text.upper(): + return None + + # Split on BUCKETS (case-insensitive) + match = re.match(r"^(.+?)\s+BUCKETS\s+(\d+)\s*$", text.strip(), flags=re.IGNORECASE) + if not match: + return None + + hash_part = match.group(1).strip() + buckets_str = match.group(2) + + # Parse the HASH/RANDOM part via SPEC + normalized = PropertyValidator.validate_and_normalize_property("distributed_by", hash_part) + + return DistributionTupleOutputType.to_unified_dict(normalized, int(buckets_str)) + + def _build_order_by_property( + self, + table_properties: t.Dict[str, t.Any], + clustered_by: t.Optional[t.List[exp.Expression]], + ) -> t.Optional[exp.Cluster]: + """ + Build ORDER BY (clustering) property. + + Supports both: + - clustered_by parameter (from create_table call) + - order_by in table_properties (backward compatibility alias) + + Priority: clustered_by parameter > order_by in table_properties + + Args: + table_properties: Dictionary containing optional order_by (will be modified) + clustered_by: Clustering columns from parameter + + Returns: + Cluster expression (generates ORDER BY) or None + """ + # Priority: clustered_by parameter > order_by in table_properties + # Use PropertyValidator to check mutual exclusion between parameter and property + order_by_param_name = PropertyValidator.check_at_most_one( + property_name="clustered_by", + property_description="clustering definition", + table_properties=table_properties, + parameter_value=clustered_by, + ) + + # If parameter was provided, it takes priority + if clustered_by is None and order_by_param_name: + # Get order_by from table_properties (already validated by check_at_most_one) + order_by = table_properties.pop(order_by_param_name, None) + if order_by is not None: + normalized = PropertyValidator.validate_and_normalize_property( + "clustered_by", order_by, preprocess_parentheses=True + ) + clustered_by = list(normalized) + + if clustered_by: + result = exp.Cluster(expressions=clustered_by) + return result + else: # noqa: RET505 + return None + + def _build_other_properties(self, table_properties: t.Dict[str, t.Any]) -> t.List[exp.Property]: + """ + Build other literal properties (replication_num, storage_medium, etc.). + + Uses validate_and_normalize_property for validation and ensures output is string, + as StarRocks PROPERTIES syntax requires all values to be strings. 
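+
+        Example (illustrative, mirroring the property matrix used in the e2e tests below):
+        a property such as replication_num = '1' is rendered into the generated DDL as
+        PROPERTIES ('replication_num' = '1').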
+ + Args: + table_properties: Dictionary containing properties (will be modified) + + Returns: + List of Property expressions + """ + other_props = [] + + for key, value in list(table_properties.items()): + # Skip special keys handled elsewhere + if key in PropertyValidator.IMPORTANT_PROPERTY_NAMES: + logger.warning(f"[StarRocks] {key!r} should have been processed already, skipping") + continue + + # Remove from properties + table_properties.pop(key) + + # Validate and normalize to string + # All other properties are treated as generic string properties + try: + normalized = PropertyValidator.validate_and_normalize_property(key, value) + other_props.append( + exp.Property( + this=exp.to_identifier(key), + value=exp.Literal.string(str(normalized)), + ) + ) + except SQLMeshError as e: + logger.warning("[StarRocks] skipping property %s due to error: %s", key, e) + + return other_props + + def _extract_and_validate_key_columns( + self, + table_properties: t.Dict[str, t.Any], + primary_key: t.Optional[t.Tuple[str, ...]] = None, + ) -> t.Tuple[t.Optional[str], t.Optional[t.Tuple[str, ...]]]: + """ + Extract and validate key columns from table_properties. + + All key types require: + - Key columns must be the first N columns in CREATE TABLE + - Column order must match the KEY clause order + + Priority: + - Parameter primary_key > table_properties primary_key + - Only one key type allowed per table + + Args: + table_properties: Table properties dictionary (lowercase keys expected) + primary_key: Primary key from method parameter (highest priority) + + Returns: + Tuple of (key_type, key_columns) + - key_type: One of 'primary_key', 'unique_key', 'duplicate_key', 'aggregate_key', None + - key_columns: Tuple of column names, or None + + Raises: + SQLMeshError: If multiple key types are defined or column extraction fails + """ + # Use PropertyValidator to check mutual exclusion + active_key_type = PropertyValidator.check_at_most_one( + property_name="key_type", # dummy + property_description="table key type", + table_properties=table_properties, + parameter_value=primary_key, + ) + + # If parameter primary_key was provided, return it + if primary_key: + return ("primary_key", primary_key) + + # Extract from table_properties + if not active_key_type: + return (None, None) + + # Get the key expression and normalize via SPEC + key_expr = table_properties[active_key_type] # Read without popping + # Use validate_and_normalize_property to get List[exp.Column], then extract names + normalized = PropertyValidator.validate_and_normalize_property( + active_key_type, key_expr, preprocess_parentheses=True + ) + key_columns = tuple(col.name for col in normalized) + + return (active_key_type, key_columns) + + def _reorder_columns_for_key( + self, + target_columns_to_types: t.Dict[str, exp.DataType], + key_columns: t.Tuple[str, ...], + key_type: str = "key", + ) -> t.Dict[str, exp.DataType]: + """ + Reorder columns to place key columns first. + + StarRocks Constraint (ALL Table Types): + Key columns (PRIMARY/UNIQUE/DUPLICATE/AGGREGATE) MUST be the first N columns + in the CREATE TABLE statement, in the same order as defined in the KEY clause. 
+ + Example: + Input: + columns = {"customer_id": INT, "order_id": BIGINT, "event_date": DATE} + key_columns = ("order_id", "event_date") + key_type = "primary_key" + + Output: + {"order_id": BIGINT, "event_date": DATE, "customer_id": INT} + + Args: + target_columns_to_types: Original column order (from SELECT) + key_columns: Key column names in desired order + key_type: Type of key for logging (primary_key, unique_key, etc.) + + Returns: + Reordered columns with key columns first + + Raises: + SQLMeshError: If a key column is not found in target_columns_to_types + """ + # Validate that all key columns exist + missing_key_cols = set(key_columns) - set(target_columns_to_types.keys()) + if missing_key_cols: + raise SQLMeshError( + f"{key_type} columns {missing_key_cols} not found in table columns. " + f"Available columns: {list(target_columns_to_types.keys())}" + ) + + # Build new ordered dict: key columns first, then remaining columns + reordered = {} + + # 1. Add key columns in key order + for key_col in key_columns: + reordered[key_col] = target_columns_to_types[key_col] + + # 2. Add remaining columns (preserve original order) + for col_name, col_type in target_columns_to_types.items(): + if col_name not in key_columns: + reordered[col_name] = col_type + + logger.info( + f"Reordered columns for {key_type.upper()}: " + f"Original order: {list(target_columns_to_types.keys())}, " + f"New order: {list(reordered.keys())}" + ) + + return reordered + + def _build_create_comment_table_exp( + self, table: exp.Table, table_comment: str, table_kind: str = "TABLE" + ) -> str: + """ + Build ALTER TABLE COMMENT SQL for table comment modification. + + StarRocks uses non-standard syntax for table comments: + ALTER TABLE {table} COMMENT = '{comment}' + + Note: This method is typically NOT called for StarRocks because: + - COMMENT_CREATION_TABLE = IN_SCHEMA_DEF_CTAS + - Comments are included directly in CREATE TABLE via SchemaCommentProperty + + However, this override is provided for potential future use cases: + - Modifying comments on existing tables via ALTER TABLE + - View comments (if COMMENT_CREATION_VIEW changes) + + Args: + table: Table expression + table_comment: The comment to add + table_kind: Type of object (TABLE, VIEW, etc.) + + Returns: + SQL string for ALTER TABLE COMMENT + """ + table_sql = table.sql(dialect=self.dialect, identify=True) + comment_sql = exp.Literal.string(self._truncate_table_comment(table_comment)).sql( + dialect=self.dialect + ) + return f"ALTER TABLE {table_sql} COMMENT = {comment_sql}" + + def _build_create_comment_column_exp( + self, + table: exp.Table, + column_name: str, + column_comment: str, + table_kind: str = "TABLE", + ) -> str: + """ + Build ALTER TABLE MODIFY COLUMN SQL for column comment modification. + + StarRocks requires column type in MODIFY COLUMN statement: + ALTER TABLE {table} MODIFY COLUMN {column} {type} COMMENT '{comment}' + + Note: This method is typically NOT called for StarRocks because: + - COMMENT_CREATION_TABLE = IN_SCHEMA_DEF_CTAS + - Column comments are included directly in CREATE TABLE DDL + + However, this override is provided for potential future use cases: + - Modifying column comments on existing tables via ALTER TABLE + + Args: + table: Table expression + column_name: Name of the column + column_comment: The comment to add + table_kind: Type of object (TABLE, VIEW, etc.) 
+
+        Returns:
+            SQL string for ALTER TABLE MODIFY COLUMN with COMMENT
+        """
+        table_sql = table.sql(dialect=self.dialect, identify=True)
+        column_sql = exp.to_identifier(column_name).sql(dialect=self.dialect, identify=True)
+
+        comment_sql = exp.Literal.string(self._truncate_column_comment(column_comment)).sql(
+            dialect=self.dialect
+        )
+
+        return f"ALTER TABLE {table_sql} MODIFY COLUMN {column_sql} COMMENT {comment_sql}"
+
+    # ==================== Methods NOT Needing Override (Base Class Works) ====================
+    # The following methods work correctly with the base class implementation:
+    # - columns(): Query column definitions via DESCRIBE TABLE
+    # - table_exists(): Check if table exists via information_schema
+    # - insert_append(): Standard INSERT INTO ... SELECT
+    # - insert_overwrite_by_time_partition(): Uses DELETE_INSERT strategy (handled by base)
+    # - fetchall() / fetchone(): Standard query execution
+    # - execute(): Base SQL execution (modified only for the `FOR UPDATE` lock operation)
+    # - create_table_properties(): Delegate to _build_table_properties_exp()
diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py
index 1808011854..dcc1e750bb 100644
--- a/sqlmesh/core/snapshot/evaluator.py
+++ b/sqlmesh/core/snapshot/evaluator.py
@@ -2026,6 +2026,40 @@ def run_post_statements(self, snapshot: Snapshot, render_kwargs: t.Any) -> None:
     self.adapter.execute(snapshot.model.render_post_statements(**render_kwargs))
+def _ensure_primary_key_for_starrocks_when_incremental_by_unique_key(
+    model: Model, physical_properties: t.Optional[t.Dict[str, t.Any]]
+) -> t.Dict[str, t.Any]:
+    """
+    Promote StarRocks incremental-by-unique-key models to PRIMARY KEY tables so that
+    complex DELETE/MERGE statements remain supported.
+    """
+
+    properties = dict(physical_properties or {})
+
+    if (
+        model.dialect != "starrocks"
+        or not model.kind.is_incremental_by_unique_key
+        or "primary_key" in properties
+    ):
+        return properties
+    unique_key: t.Optional[t.List[exp.Expression]] = model.unique_key
+    if unique_key:
+        properties["primary_key"] = (
+            unique_key[0] if len(unique_key) == 1 else exp.Tuple(expressions=unique_key)
+        )
+        logger.info(
+            "Model '%s' promoted to PRIMARY KEY table on StarRocks to support rich DELETE operations.",
+            model.name,
+        )
+    else:
+        logger.warning(
+            f"StarRocks incremental-by-unique-key model '{model.name}' requires a PRIMARY KEY table. 
" + f"Specify `physical_properties['primary_key']` or set `unique_key` on the model.", + ) + + return properties + + class MaterializableStrategy(PromotableStrategy, abc.ABC): def create( self, @@ -2038,6 +2072,9 @@ def create( ) -> None: ctas_query = model.ctas_query(**render_kwargs) physical_properties = kwargs.get("physical_properties", model.physical_properties) + physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( + model, physical_properties + ) logger.info("Creating table '%s'", table_name) if model.annotated: @@ -2152,6 +2189,10 @@ def _replace_query_for_model( except Exception: columns_to_types, source_columns = None, None + physical_properties = kwargs.get("physical_properties", model.physical_properties) + physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( + model, physical_properties + ) self.adapter.replace_query( name, query_or_df, @@ -2160,7 +2201,7 @@ def _replace_query_for_model( partitioned_by=model.partitioned_by, partition_interval_unit=model.partition_interval_unit, clustered_by=model.clustered_by, - table_properties=kwargs.get("physical_properties", model.physical_properties), + table_properties=physical_properties, table_description=model.description, column_descriptions=model.column_descriptions, target_columns_to_types=columns_to_types, @@ -2294,6 +2335,10 @@ def insert( table_name, render_kwargs=render_kwargs, ) + physical_properties = kwargs.get("physical_properties", model.physical_properties) + physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( + model, physical_properties + ) self.adapter.merge( table_name, query_or_df, @@ -2305,7 +2350,7 @@ def insert( end=kwargs.get("end"), execution_time=kwargs.get("execution_time"), ), - physical_properties=kwargs.get("physical_properties", model.physical_properties), + physical_properties=physical_properties, source_columns=source_columns, ) @@ -2320,6 +2365,10 @@ def append( columns_to_types, source_columns = self._get_target_and_source_columns( model, table_name, render_kwargs=render_kwargs ) + physical_properties = kwargs.get("physical_properties", model.physical_properties) + physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( + model, physical_properties + ) self.adapter.merge( table_name, query_or_df, @@ -2331,7 +2380,7 @@ def append( end=kwargs.get("end"), execution_time=kwargs.get("execution_time"), ), - physical_properties=kwargs.get("physical_properties", model.physical_properties), + physical_properties=physical_properties, source_columns=source_columns, ) @@ -2674,12 +2723,20 @@ def insert( return logger.info("Replacing view '%s'", table_name) + materialized_properties = None + if is_materialized_view: + materialized_properties = { + "partitioned_by": model.partitioned_by, + "clustered_by": model.clustered_by, + "partition_interval_unit": model.partition_interval_unit, + } self.adapter.create_view( table_name, query_or_df, model.columns_to_types, replace=must_recreate_view, materialized=is_materialized_view, + materialized_properties=materialized_properties, view_properties=kwargs.get("physical_properties", model.physical_properties), table_description=model.description, column_descriptions=model.column_descriptions, @@ -3101,13 +3158,17 @@ def create( if is_table_deployable and is_snapshot_deployable: # We could deploy this to prod; create a proper managed table logger.info("Creating managed table: %s", table_name) + physical_properties = 
kwargs.get("physical_properties", model.physical_properties) + physical_properties = _ensure_primary_key_for_starrocks_when_incremental_by_unique_key( + model, physical_properties + ) self.adapter.create_managed_table( table_name=table_name, query=model.render_query_or_raise(**render_kwargs), target_columns_to_types=model.columns_to_types, partitioned_by=model.partitioned_by, clustered_by=model.clustered_by, - table_properties=kwargs.get("physical_properties", model.physical_properties), + table_properties=physical_properties, table_description=model.description, column_descriptions=model.column_descriptions, table_format=model.table_format, diff --git a/tests/core/engine_adapter/integration/__init__.py b/tests/core/engine_adapter/integration/__init__.py index 4ad6a17944..4eb3038135 100644 --- a/tests/core/engine_adapter/integration/__init__.py +++ b/tests/core/engine_adapter/integration/__init__.py @@ -77,6 +77,7 @@ def pytest_marks(self) -> t.List[MarkDecorator]: IntegrationTestEngine("spark", native_dataframe_type="pyspark"), IntegrationTestEngine("clickhouse", catalog_types=["standalone", "cluster"]), IntegrationTestEngine("risingwave"), + IntegrationTestEngine("starrocks"), # Cloud engines that need paid accounts / special credentials IntegrationTestEngine("clickhouse_cloud", cloud=True), IntegrationTestEngine("redshift", cloud=True), @@ -265,6 +266,7 @@ def timestamp_columns(self) -> t.List[str]: for k, v in self.columns_to_types.items() if v.sql().lower().startswith("timestamp") or (v.sql().lower() == "datetime" and self.dialect == "bigquery") + or (v.sql().lower() == "datetime" and self.dialect == "starrocks") ] @property @@ -307,6 +309,9 @@ def supports_merge(self) -> bool: if self.dialect == "risingwave": return False + if self.dialect == "starrocks": + return False + return True @property @@ -448,7 +453,7 @@ def get_table_comment( AND pgc.relkind = '{"v" if table_kind == "VIEW" else "r"}' ; """ - elif self.dialect in ["mysql", "snowflake"]: + elif self.dialect in ["mysql", "snowflake", "starrocks"]: # Snowflake treats all identifiers as uppercase unless they are lowercase and quoted. # They are lowercase and quoted in sushi but not in the inline tests. if self.dialect == "snowflake" and snowflake_capitalize_ids: @@ -458,6 +463,7 @@ def get_table_comment( comment_field_name = { "mysql": "table_comment", "snowflake": "comment", + "starrocks": "table_comment", } query = f""" @@ -563,7 +569,7 @@ def get_column_comments( AND pgc.relkind = '{"v" if table_kind == "VIEW" else "r"}' ; """ - elif self.dialect in ["mysql", "snowflake", "trino"]: + elif self.dialect in ["mysql", "snowflake", "trino", "starrocks"]: # Snowflake treats all identifiers as uppercase unless they are lowercase and quoted. # They are lowercase and quoted in sushi but not in the inline tests. 
if self.dialect == "snowflake" and snowflake_capitalize_ids: @@ -574,6 +580,7 @@ def get_column_comments( "mysql": "column_comment", "snowflake": "comment", "trino": "comment", + "starrocks": "column_comment", } query = f""" diff --git a/tests/core/engine_adapter/integration/config.yaml b/tests/core/engine_adapter/integration/config.yaml index 0b1ecd8193..da3784d2da 100644 --- a/tests/core/engine_adapter/integration/config.yaml +++ b/tests/core/engine_adapter/integration/config.yaml @@ -118,6 +118,16 @@ gateways: host: {{ env_var('DOCKER_HOSTNAME', 'localhost') }} port: 4566 check_import: false + inttest_starrocks: + connection: + type: starrocks + host: {{ env_var('DOCKER_HOSTNAME', 'localhost') }} + port: 9030 + user: root + password: "" + check_import: false + state_connection: + type: duckdb # Cloud databases diff --git a/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml b/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml new file mode 100644 index 0000000000..3a19fa6a3f --- /dev/null +++ b/tests/core/engine_adapter/integration/docker/compose.starrocks.yaml @@ -0,0 +1,27 @@ +services: + starrocks-fe: + image: starrocks/fe-ubuntu:3.5-latest + container_name: starrocks-fe + hostname: starrocks-fe + environment: + - FE_SERVERS=fe1:starrocks-fe:9030 + ports: + - "9030:9030" # MySQL protocol port for tests + - "8030:8030" # HTTP port + networks: + - starrocks_net + + starrocks-be: + image: starrocks/be-ubuntu:3.5-latest + container_name: starrocks-be + hostname: starrocks-be + depends_on: + - starrocks-fe + environment: + - FE_SERVERS=starrocks-fe:9030 + networks: + - starrocks_net + +networks: + starrocks_net: + driver: bridge diff --git a/tests/core/engine_adapter/integration/test_integration.py b/tests/core/engine_adapter/integration/test_integration.py index 1fba346db3..86b54d7399 100644 --- a/tests/core/engine_adapter/integration/test_integration.py +++ b/tests/core/engine_adapter/integration/test_integration.py @@ -777,6 +777,8 @@ def test_insert_overwrite_by_time_partition(ctx_query_and_df: TestContext): ds_type = "datetime" if ctx.dialect == "tsql": ds_type = "varchar(max)" + if ctx.dialect == "starrocks": + ds_type = "datetime" ctx.columns_to_types = {"id": "int", "ds": ds_type} table = ctx.table("test_table") @@ -865,6 +867,8 @@ def test_insert_overwrite_by_time_partition_source_columns(ctx_query_and_df: Tes ds_type = "datetime" if ctx.dialect == "tsql": ds_type = "varchar(max)" + if ctx.dialect == "starrocks": + ds_type = "datetime" ctx.columns_to_types = {"id": "int", "ds": ds_type} columns_to_types = { @@ -2579,6 +2583,7 @@ def test_dialects(ctx: TestContext): "mysql": pd.Timestamp("2020-01-01 00:00:00"), "spark": pd.Timestamp("2020-01-01 00:00:00"), "databricks": pd.Timestamp("2020-01-01 00:00:00"), + "starrocks": pd.Timestamp("2020-01-01 00:00:00"), }, ), ( diff --git a/tests/core/engine_adapter/integration/test_integration_starrocks.py b/tests/core/engine_adapter/integration/test_integration_starrocks.py new file mode 100644 index 0000000000..eee2b2054b --- /dev/null +++ b/tests/core/engine_adapter/integration/test_integration_starrocks.py @@ -0,0 +1,2454 @@ +""" +Integration tests for StarRocks Engine Adapter + +These tests require a running StarRocks instance. +They verify that the generated SQL actually works on real StarRocks database. 
+ +Strategy: +- Basic test: Verify fundamental functionality works +- Complex test: Verify comprehensive SQL with all features works + +Run with: + pytest -m "starrocks and docker" tests/core/engine_adapter/integration/test_integration_starrocks.py + +Or against local StarRocks: + export STARROCKS_HOST=localhost + export STARROCKS_PORT=9030 + export STARROCKS_USER=root + export STARROCKS_PASSWORD="" + pytest tests/core/engine_adapter/integration/test_integration_starrocks.py +""" + +import logging +import os +import re +import typing as t +from functools import partial + +import pytest +from sqlglot import exp + +from sqlmesh.core.engine_adapter.starrocks import StarRocksEngineAdapter +from sqlmesh.core.model.definition import load_sql_based_model, SqlModel +import sqlmesh.core.dialect as d + +from tests.core.engine_adapter.integration import TestContext + +# Mark as docker test (can also run against local StarRocks) +# Remove 'docker' marker if you want to run against local instance only +pytestmark = [pytest.mark.starrocks, pytest.mark.docker, pytest.mark.engine] + + +logger = logging.getLogger(__name__) + + +def _load_sql_model(model_sql: str) -> SqlModel: + expressions = d.parse(model_sql, default_dialect="starrocks") + return t.cast(SqlModel, load_sql_based_model(expressions)) + + +def _materialized_properties_from_model(model: SqlModel) -> t.Optional[t.Dict[str, t.Any]]: + props: t.Dict[str, t.Any] = {} + if model.partitioned_by: + props["partitioned_by"] = model.partitioned_by + if model.clustered_by: + props["clustered_by"] = model.clustered_by + return props or None + + +def _model_name_from_table(table: exp.Table) -> str: + if table.db: + return f"{table.db}.{table.name}" + return table.name + + +def normalize_sql(sql: str) -> str: + """Normalizes a SQL string for comparison.""" + # Remove comments + sql = re.sub(r"--.*\n", "", sql) + # Replace newlines and tabs with spaces + sql = sql.replace("\n", " ").replace("\t", "") + # Collapse multiple spaces into one + sql = re.sub(r"\s+", " ", sql) + # Remove spaces around parentheses, commas, and equals for consistency + sql = re.sub(r"\s*\(\s*", "(", sql) + sql = re.sub(r"\s*\)\s*", ")", sql) + sql = re.sub(r"\s*,\s*", ",", sql) + sql = re.sub(r"\s*=\s*", "=", sql) + # Remove all paired backticks around identifiers + sql = re.sub(r"`([^`]+)`", r"\1", sql) + sql = re.sub(r"\'", '"', sql) + + return sql.strip() + + +Row = t.Tuple[t.Any, ...] + + +def expect_row(row: t.Optional[Row]) -> Row: + assert row is not None + return row + + +def fetchone_or_fail(adapter: StarRocksEngineAdapter, query: t.Any) -> Row: + return expect_row(adapter.fetchone(query)) + + +# ============================================================================= +# TestContext-based Integration Tests +# ============================================================================= +# +# These tests demonstrate how to use SQLMesh's TestContext helpers in a StarRocks-specific +# integration file: +# - Automatic schema isolation via ctx.test_id +# - Automatic cleanup of created schemas +# +# Unlike the shared integration harness (which loads the full gateway config), this local +# fixture keeps StarRocks tests self-contained and runnable with only StarRocks deps installed. 
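+#
+# A minimal usage sketch (illustrative only; it relies on the `ctx` / `engine_adapter` fixtures
+# defined below and the `fetchone_or_fail` helper defined above):
+#
+#     def test_example(ctx, engine_adapter):
+#         table = ctx.table("example")  # schema-isolated name, cleaned up by the fixture
+#         engine_adapter.create_table(
+#             table, target_columns_to_types={"id": exp.DataType.build("INT")}
+#         )
+#         table_sql = table.sql(dialect=ctx.dialect, identify=True)
+#         assert fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {table_sql}")[0] == 0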
+ + +@pytest.fixture(scope="module") +def starrocks_connection_config() -> t.Dict[str, t.Any]: + """StarRocks connection configuration from environment variables.""" + return { + "host": os.getenv("STARROCKS_HOST", "localhost"), + "port": int(os.getenv("STARROCKS_PORT", "9030")), + "user": os.getenv("STARROCKS_USER", "root"), + "password": os.getenv("STARROCKS_PASSWORD", ""), + } + + +@pytest.fixture +def ctx(tmp_path, starrocks_connection_config) -> t.Generator[TestContext, None, None]: + """ + A lightweight TestContext fixture which avoids loading the full integration gateway config. + + This keeps the StarRocks integration tests self-contained (similar to `starrocks_adapter`) + while still providing TestContext niceties like: + - ctx.table(...) naming + schema isolation + - automatic cleanup + """ + from pymysql import connect + + adapter = StarRocksEngineAdapter(partial(connect, **starrocks_connection_config)) + ctx = TestContext( + "query", + adapter, + mark="starrocks", + gateway="manual_starrocks", + tmp_path=tmp_path, + is_remote=False, + ) + + ctx.init() + try: + with ctx.engine_adapter.session({}): + yield ctx + finally: + ctx.cleanup() + + +@pytest.fixture +def engine_adapter(ctx: TestContext) -> StarRocksEngineAdapter: + assert isinstance(ctx.engine_adapter, StarRocksEngineAdapter) + return ctx.engine_adapter + + +@pytest.fixture(scope="module") +def starrocks_adapter( + starrocks_connection_config, +) -> t.Generator[StarRocksEngineAdapter, None, None]: + """Create a real StarRocks adapter connected to database. + It's still used in a lot of tests, so it can't be removed yet. + """ + from pymysql import connect + + connection_factory = partial(connect, **starrocks_connection_config) + adapter = StarRocksEngineAdapter(connection_factory) + + yield adapter + + # Cleanup: adapter will auto-close connection + + +@pytest.fixture(scope="module", autouse=True) +def init_test_integration_env(starrocks_adapter: StarRocksEngineAdapter) -> None: + """ + Auto-adjust default_replication_num for small shared-nothing clusters. + + If run_mode is shared_nothing and available backends < 3, set default_replication_num = 1 + to prevent replica-creation failures in tests. 
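+
+    The adjustment applied here is equivalent to manually running
+    ADMIN SET FRONTEND CONFIG ("default_replication_num" = "1").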
+ """ + + def _get_config_value(name: str) -> t.Optional[str]: + try: + row = starrocks_adapter.fetchone(f"ADMIN SHOW FRONTEND CONFIG LIKE '{name}'") + except Exception as e: # pragma: no cover - defensive for older SR versions + logger.warning("Skipping config lookup %s: %s", name, e) + return None + if not row or len(row) < 3: + logger.warning("Unexpected result for %s: %s", name, row) + return None + return str(row[2]).strip() + + run_mode = _get_config_value("run_mode") + if not run_mode or run_mode.lower() != "shared_nothing": + return + + try: + backends = starrocks_adapter.fetchall("SHOW BACKENDS") + except Exception as e: # pragma: no cover - defensive for older SR versions + logger.warning("Skipping backend count check: %s", e) + return + + be_count = len(backends) + if be_count >= 3: + return + + current_replication = _get_config_value("default_replication_num") + try: + current_replication_int = int(current_replication) if current_replication is not None else None + except Exception: + current_replication_int = None + + if current_replication_int is not None and current_replication_int <= be_count: + return + + try: + starrocks_adapter.execute('ADMIN SET FRONTEND CONFIG ("default_replication_num" = "1")') + logger.info( + "Set default_replication_num=1 for shared_nothing cluster with %s backends (was %s)", + be_count, + current_replication, + ) + except Exception as e: # pragma: no cover - do not break tests if lacking privilege + logger.warning( + "Failed to set default_replication_num for shared_nothing cluster: %s", e + ) + + +class TestBasicOperations: + """ + Basic Operations + + Each test method verifies one fundamental SQL operation. + This allows running individual tests and clear failure reporting. + """ + + def test_create_drop_schema(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test CREATE DATABASE and DROP DATABASE (TestContext version).""" + db_name = ctx.schema("sr_test_create_drop_db") + + # CREATE DATABASE + engine_adapter.create_schema(db_name, ignore_if_exists=True) + result = fetchone_or_fail( + engine_adapter, + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{db_name}'", + ) + assert result[0] == db_name + + # DROP DATABASE + engine_adapter.drop_schema(db_name) + result: t.Optional[Row] = engine_adapter.fetchone( + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{db_name}'" + ) + assert result is None, "DROP DATABASE failed" + + def test_create_drop_table(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test CREATE TABLE and DROP TABLE (TestContext version).""" + table = ctx.table("sr_test_table") + + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + + db_name = table.db + table_name = table.name + exists = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}'" + ) + assert exists is not None, "CREATE TABLE failed" + + engine_adapter.drop_table(table) + exists = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}'" + ) + assert exists is None, "DROP TABLE failed" + + def test_create_table_like_preserves_metadata_and_copies_no_data( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ) -> None: + """ + Verify StarRocks native CREATE TABLE LIKE 
semantics: + - Copies schema (columns) + - Does NOT copy data + - Preserves key table metadata (at least PRIMARY KEY / DISTRIBUTED BY) + """ + source = ctx.table("src_like") + target = ctx.table("tgt_like") + + engine_adapter.create_table( + source, + target_columns_to_types={ + "id": exp.DataType.build("BIGINT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + primary_key=("id",), + table_properties={ + # Make metadata visible in SHOW CREATE TABLE so LIKE preservation is testable. + "distributed_by": "HASH(id) BUCKETS 10", + "replication_num": "1", + }, + ) + + engine_adapter.execute( + f"INSERT INTO {source.sql(dialect=ctx.dialect, identify=True)} (id, name) " + "VALUES (1, 'a'), (2, 'b')" + ) + + engine_adapter.create_table_like(target, source, exists=True) + + # Like should not copy data. + src_count = fetchone_or_fail( + engine_adapter, f"SELECT COUNT(*) FROM {source.sql(dialect=ctx.dialect, identify=True)}" + )[0] + tgt_count = fetchone_or_fail( + engine_adapter, f"SELECT COUNT(*) FROM {target.sql(dialect=ctx.dialect, identify=True)}" + )[0] + assert src_count == 2 + assert tgt_count == 0 + + # Like should preserve key metadata (engine-defined behavior). + ddl = fetchone_or_fail( + engine_adapter, f"SHOW CREATE TABLE {target.sql(dialect=ctx.dialect, identify=True)}" + )[1] + ddl_upper = ddl.upper() + assert "PRIMARY KEY" in ddl_upper + assert "DISTRIBUTED BY" in ddl_upper + + def test_create_table_like_exists_false_raises( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ) -> None: + """If exists=False and target already exists, StarRocks should error.""" + source = ctx.table("src_like_exists") + target = ctx.table("tgt_like_exists") + + engine_adapter.create_table( + source, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + }, + primary_key=("id",), + table_properties={"replication_num": "1"}, + ) + engine_adapter.create_table_like(target, source, exists=True) + + with pytest.raises(Exception): + engine_adapter.create_table_like(target, source, exists=False) + + def test_delete(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test DELETE operation (TestContext version).""" + table = ctx.table("sr_test_table") + table_sql = table.sql(dialect=ctx.dialect, identify=True) + + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + engine_adapter.execute( + f"INSERT INTO {table_sql} (id, name) VALUES (1, 'Alice'), (2, 'Bob')" + ) + + engine_adapter.delete_from(table, "id = 2") + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {table_sql}") + assert count[0] == 1, "DELETE failed" + + def test_rename_table(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test RENAME TABLE operation (TestContext version).""" + old_table = ctx.table("old_table") + new_table = ctx.table("new_table") + + old_table_sql = old_table.sql(dialect=ctx.dialect, identify=True) + new_table_sql = new_table.sql(dialect=ctx.dialect, identify=True) + + engine_adapter.create_table( + old_table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + + engine_adapter.execute(f"INSERT INTO {old_table_sql} (id, name) VALUES (1, 'Test')") + engine_adapter.rename_table(old_table, new_table) + + db_name = old_table.db + old_table_name = old_table.name + new_table_name = new_table.name + + old_exists = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM 
information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{old_table_name}'" + ) + assert old_exists is None, "Old table should not exist after rename" + + new_exists = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{new_table_name}'" + ) + assert new_exists is not None, "New table should exist after rename" + + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {new_table_sql}") + assert count[0] == 1, "Data should be preserved after rename" + + def test_create_index(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test CREATE INDEX operation (skipped for StarRocks) (TestContext version).""" + table = ctx.table("sr_test_table") + table_sql = table.sql(dialect=ctx.dialect, identify=True) + + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + + # CREATE INDEX (should be skipped silently) + engine_adapter.create_index(table, "idx_name", ("name",)) + + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {table_sql}") + assert count[0] >= 0, "Table should still be functional after skipped index creation" + + def test_create_drop_view(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """Test CREATE VIEW and DROP VIEW (TestContext version).""" + table = ctx.table("sr_test_table") + view = ctx.table("sr_test_view") + + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + + query = exp.select(exp.column("id"), exp.column("name")).from_(table) + engine_adapter.create_view(view, query) + + db_name = view.db + view_name = view.name + result = fetchone_or_fail( + engine_adapter, + f"SELECT TABLE_NAME FROM information_schema.VIEWS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{view_name}'", + ) + assert result, "CREATE VIEW failed" + + engine_adapter.drop_view(view) + result_optional: t.Optional[Row] = engine_adapter.fetchone( + f"SELECT TABLE_NAME FROM information_schema.VIEWS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{view_name}'" + ) + assert result_optional is None, "DROP VIEW failed" + + +class TestViewAndMaterializedViewFeatures: + """Integration tests for StarRocks view SECURITY and MV property combos.""" + + def test_create_view_with_security( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + source = ctx.table("sr_sec_src") + view = ctx.table("sr_sec_view") + source_sql_ident = source.sql(dialect=ctx.dialect, identify=True) + view_sql_ident = view.sql(dialect=ctx.dialect, identify=True) + view_model_name = _model_name_from_table(view) + + engine_adapter.create_table( + source, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + engine_adapter.execute( + f"INSERT INTO {source_sql_ident} (id, name) VALUES (1, 'Alice'), (2, 'Bob')" + ) + + model_sql = f""" + MODEL ( + name {view_model_name}, + kind VIEW, + dialect starrocks, + columns ( + id INT, + name VARCHAR(100) + ), + virtual_properties ( + security = invoker + ) + ); + SELECT id, name FROM {source_sql_ident}; + """ + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + engine_adapter.create_view( + view, + query, + replace=True, + target_columns_to_types=model.columns_to_types, + 
view_properties=model.virtual_properties, + ) + + ddl = fetchone_or_fail(engine_adapter, f"SHOW CREATE VIEW {view_sql_ident}")[1] + assert "SECURITY INVOKER" in ddl.upper() + + def test_create_view_replace_flag( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + source = ctx.table("sr_replace_src") + view = ctx.table("sr_replace_view") + source_sql_ident = source.sql(dialect=ctx.dialect, identify=True) + view_model_name = _model_name_from_table(view) + + engine_adapter.create_table( + source, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + ) + engine_adapter.execute(f"INSERT INTO {source_sql_ident} (id, name) VALUES (1, 'A')") + + model_sql = f""" + MODEL ( + name {view_model_name}, + kind VIEW, + dialect starrocks, + columns (id INT, name VARCHAR(100)) + ); + SELECT id, name FROM {source_sql_ident}; + """ + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + + # Success with replace=True to replace the old one + engine_adapter.create_view( + view, + query, + replace=True, + target_columns_to_types=model.columns_to_types, + view_properties=model.virtual_properties, + ) + + # Failed to create a view when it's existing + with pytest.raises(Exception): + engine_adapter.create_view( + view, + query, + replace=False, + target_columns_to_types=model.columns_to_types, + view_properties=model.virtual_properties, + ) + + def _create_sales_source_table( + self, + ctx: TestContext, + engine_adapter: StarRocksEngineAdapter, + table: exp.Table, + ) -> str: + table_sql = table.sql(dialect=ctx.dialect, identify=True) + engine_adapter.create_table( + table, + target_columns_to_types={ + "order_id": exp.DataType.build("BIGINT"), + "customer_id": exp.DataType.build("INT"), + "event_date": exp.DataType.build("DATE"), + "amount": exp.DataType.build("DECIMAL(18,2)"), + "region": exp.DataType.build("VARCHAR(50)"), + }, + primary_key=("order_id", "event_date"), + partitioned_by="event_date", + ) + engine_adapter.execute( + f""" + INSERT INTO {table_sql} (order_id, customer_id, event_date, amount, region) + VALUES + (1, 1001, '2024-01-01', 10.50, 'us'), + (2, 1002, '2024-01-02', 20.75, 'eu') + """ + ) + return table_sql + + def test_materialized_view_combo_with_materialized_properties( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + source = ctx.table("sr_mv_combo_a_src") + mv = ctx.table("sr_mv_combo_a") + mv_sql = mv.sql(dialect=ctx.dialect, identify=True) + source_sql = source.sql(dialect=ctx.dialect, identify=True) + mv_model_name = _model_name_from_table(mv) + + self._create_sales_source_table(ctx, engine_adapter, source) + + model_sql = f""" + MODEL ( + name {mv_model_name}, + kind VIEW ( + materialized true + ), + dialect starrocks, + description 'MV combo A description', + columns ( + order_id BIGINT, + customer_id INT, + event_date DATE, + amount DECIMAL(18,2), + region VARCHAR(50) + ), + column_descriptions ( + order_id = 'Order identifier', + customer_id = 'Customer identifier' + ), + partitioned_by (event_date), + clustered_by (customer_id, region), + virtual_properties ( + distributed_by = 'HASH(order_id) BUCKETS 8', + refresh_moment = DEFERRED, + refresh_scheme = 'ASYNC START (''2025-01-01 00:00:00'') EVERY (INTERVAL 5 MINUTE)', + replication_num = '1' + ) + ); + SELECT order_id, customer_id, event_date, amount, region + FROM {source_sql}; + """ + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + 
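+        # _materialized_properties_from_model() (helper defined near the top of this module) lifts
+        # the MODEL-level partitioned_by / clustered_by into a separate dict so they can be passed
+        # to create_view() via materialized_properties=, independently of the virtual_properties block.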
materialized_properties = _materialized_properties_from_model(model) + + engine_adapter.create_view( + mv, + query, + replace=True, + materialized=True, + target_columns_to_types=model.columns_to_types, + materialized_properties=materialized_properties, + view_properties=model.virtual_properties, + table_description=model.description, + column_descriptions=model.column_descriptions, + ) + + ddl = fetchone_or_fail(engine_adapter, f"SHOW CREATE MATERIALIZED VIEW {mv_sql}")[1] + logger.debug(f"mv ddl: {ddl}") + ddl_upper = normalize_sql(ddl).upper() + assert "REFRESH DEFERRED ASYNC" in ddl_upper + assert ( + "START('2025-01-01 00:00:00')EVERY(INTERVAL 5 MINUTE)" in ddl_upper + or 'START("2025-01-01 00:00:00")EVERY(INTERVAL 5 MINUTE)' in ddl_upper + ) + assert "PARTITION BY(EVENT_DATE)" in ddl_upper + assert "ORDER BY(CUSTOMER_ID,REGION)" in ddl_upper + assert "DISTRIBUTED BY HASH(ORDER_ID)BUCKETS 8" in ddl_upper + assert ( + "COMMENT 'MV COMBO A DESCRIPTION'" in ddl_upper + or 'COMMENT "MV COMBO A DESCRIPTION"' in ddl_upper + ) + + def test_materialized_view_combo_all_properties_block( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + source = ctx.table("sr_mv_combo_b_src") + mv = ctx.table("sr_mv_combo_b") + mv_sql = mv.sql(dialect=ctx.dialect, identify=True) + source_sql = source.sql(dialect=ctx.dialect, identify=True) + mv_model_name = _model_name_from_table(mv) + + self._create_sales_source_table(ctx, engine_adapter, source) + + model_sql = f""" + MODEL ( + name {mv_model_name}, + kind VIEW ( + materialized true + ), + dialect starrocks, + description 'Analytics MV combo B', + columns ( + order_id BIGINT, + customer_id INT, + event_date DATE, + amount DECIMAL(18,2) + ), + column_descriptions ( + amount = 'Order amount' + ), + virtual_properties ( + partition_by = event_date, + -- ignored when MV + partitions = ( + 'PARTITION p202401 VALUES LESS THAN ("2024-02-01")', + 'PARTITION p202402 VALUES LESS THAN ("2024-03-01")' + ), + distributed_by = (kind=HASH, expressions=(order_id, customer_id), buckets=4), + order_by = (order_id, event_date), + refresh_scheme = MANUAL, + replication_num = '1' + ) + ); + SELECT order_id, customer_id, event_date, amount + FROM {source_sql}; + """ + model = _load_sql_model(model_sql) + query = model.render_query() + assert query is not None + materialized_properties = _materialized_properties_from_model(model) + + engine_adapter.create_view( + mv, + query, + replace=True, + materialized=True, + target_columns_to_types=model.columns_to_types, + materialized_properties=materialized_properties, + view_properties=model.virtual_properties, + table_description=model.description, + column_descriptions=model.column_descriptions, + ) + + ddl = fetchone_or_fail(engine_adapter, f"SHOW CREATE MATERIALIZED VIEW {mv_sql}")[1] + ddl_upper = normalize_sql(ddl).upper() + assert "REFRESH MANUAL" in ddl_upper + assert "PARTITION P202401" not in ddl_upper # ignored when MV + assert "PARTITION P202402" not in ddl_upper # ignored when MV + assert "PARTITION BY(EVENT_DATE)" in ddl_upper + assert "ORDER BY(ORDER_ID,EVENT_DATE)" in ddl_upper + assert "DISTRIBUTED BY HASH(ORDER_ID,CUSTOMER_ID)BUCKETS 4" in ddl_upper + assert ( + "COMMENT 'ANALYTICS MV COMBO B'" in ddl_upper + or 'COMMENT "ANALYTICS MV COMBO B"' in ddl_upper + ) + + +class TestTableFeatures: + """ + Table Features + + Each test method verifies one CREATE TABLE feature that is NOT covered by E2E tests. + Focus on independent functionality like comments and data type compatibility. 
+ """ + + def test_table_and_column_comments( + self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter + ): + """Test table and column comments.""" + table = ctx.table("sr_comment_table") + db_name = table.db + table_name = table.name + + # CREATE TABLE with comments + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("INT"), + "name": exp.DataType.build("VARCHAR(100)"), + }, + table_description="Test table comment", + column_descriptions={ + "id": "User ID", + "name": "User name", + }, + ) + + # Verify table comment + result = fetchone_or_fail( + engine_adapter, + f"SELECT TABLE_COMMENT FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}'", + ) + assert result[0] == "Test table comment", "Table comment not set" + + # Verify column comments + columns = engine_adapter.fetchall( + f"SELECT COLUMN_NAME, COLUMN_COMMENT FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}' " + f"ORDER BY ORDINAL_POSITION" + ) + column_comments = {row[0]: row[1] for row in columns} + assert column_comments["id"] == "User ID" + assert column_comments["name"] == "User name" + + def test_multiple_data_types(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """ + Test basic data types support. + + Covers: numeric, string, datetime, boolean, and JSON types with precision. + Reference: https://docs.starrocks.io/docs/sql-reference/data-types/ + """ + table = ctx.table("sr_types_table") + db_name = table.db + table_name = table.name + table_sql = table.sql(dialect=ctx.dialect, identify=True) + + # CREATE TABLE with multiple data types + engine_adapter.create_table( + table, + target_columns_to_types={ + # Numeric types + "col_tinyint": exp.DataType.build("TINYINT"), + "col_smallint": exp.DataType.build("SMALLINT"), + "col_int": exp.DataType.build("INT"), + "col_bigint": exp.DataType.build("BIGINT"), + "col_float": exp.DataType.build("FLOAT"), + "col_double": exp.DataType.build("DOUBLE"), + "col_decimal": exp.DataType.build("DECIMAL(18,2)"), + # String types with precision + "col_char": exp.DataType.build("CHAR(10)"), + "col_varchar": exp.DataType.build("VARCHAR(200)"), + "col_string": exp.DataType.build("STRING"), + # Date/Time types + "col_date": exp.DataType.build("DATE"), + "col_datetime": exp.DataType.build("DATETIME"), + # Boolean and JSON + "col_boolean": exp.DataType.build("BOOLEAN"), + "col_json": exp.DataType.build("JSON"), + }, + ) + + # Verify all columns created with correct types + columns = engine_adapter.fetchall( + f"SELECT COLUMN_NAME, DATA_TYPE FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}' " + f"ORDER BY ORDINAL_POSITION" + ) + assert len(columns) == 14, f"Expected 14 columns, got {len(columns)}" + + # Test data insertion with various types + engine_adapter.execute( + f""" + INSERT INTO {table_sql} + (col_tinyint, col_smallint, col_int, col_bigint, col_float, col_double, col_decimal, + col_char, col_varchar, col_string, col_date, col_datetime, col_boolean, col_json) + VALUES + (127, 32767, 2147483647, 9223372036854775807, 3.14, 3.141592653589793, 12345.67, + 'test', 'test varchar', 'test string', '2024-01-01', '2024-01-01 12:00:00', + true, '{{"key": "value"}}') + """ + ) + + # Verify insertion + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {table_sql}") + assert count[0] == 1, "Data insertion with basic types failed" + + # Verify data retrieval + result = 
fetchone_or_fail( + engine_adapter, f"SELECT col_int, col_varchar, col_date FROM {table_sql}" + ) + assert result[0] == 2147483647 + assert result[1] == "test varchar" + + # @pytest.mark.skip(reason="Complex types (ARRAY/MAP/STRUCT) may not be fully supported yet") + def test_complex_data_types(self, ctx: TestContext, engine_adapter: StarRocksEngineAdapter): + """ + Test complex and nested data types support (ARRAY, MAP, STRUCT). + + Covers: + - Simple complex types: ARRAY, MAP, STRUCT + - Nested ARRAY: ARRAY<ARRAY<INT>> + - Nested MAP: MAP<STRING, ARRAY<INT>> + - Nested STRUCT: STRUCT<id BIGINT, tags ARRAY<STRING>, metadata MAP<STRING, INT>> + - Mixed nesting: ARRAY<STRUCT<id INT, name STRING>> + - Deep nesting: MAP<STRING, ARRAY<STRUCT<id INT, name STRING>>> + + Note: These types are available in StarRocks 2.5+ but may require additional + configuration or may not be fully supported in the current adapter. + Reference: https://docs.starrocks.io/docs/sql-reference/data-types/ + """ + table = ctx.table("sr_complex_types_table") + db_name = table.db + table_name = table.name + table_sql = table.sql(dialect=ctx.dialect, identify=True) + + # CREATE TABLE with complex and nested data types + engine_adapter.create_table( + table, + target_columns_to_types={ + "id": exp.DataType.build("BIGINT"), + # Simple complex types + "col_array_simple": exp.DataType.build("ARRAY<INT>"), + "col_map_simple": exp.DataType.build("MAP<STRING, INT>"), + "col_struct_simple": exp.DataType.build("STRUCT<id INT, name STRING>"), + # Nested ARRAY + "col_array_nested": exp.DataType.build("ARRAY<ARRAY<INT>>"), + # Nested MAP (value is ARRAY) + "col_map_nested": exp.DataType.build("MAP<STRING, ARRAY<INT>>"), + # Nested STRUCT (contains ARRAY and MAP) + "col_struct_nested": exp.DataType.build( + "STRUCT<id BIGINT, tags ARRAY<STRING>, metadata MAP<STRING, INT>>" + ), + # ARRAY of STRUCT + "col_array_of_struct": exp.DataType.build("ARRAY<STRUCT<id INT, name STRING>>"), + # Deep nesting: MAP with ARRAY of STRUCT + "col_deep_nested": exp.DataType.build( + "MAP<STRING, ARRAY<STRUCT<id INT, name STRING>>>" + ), + }, + ) + + # Verify all columns created + columns = engine_adapter.fetchall( + f"SELECT COLUMN_NAME, DATA_TYPE FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = '{table_name}' " + f"ORDER BY ORDINAL_POSITION" + ) + assert len(columns) == 9, f"Expected 9 columns, got {len(columns)}" + + # Test data insertion with nested types + engine_adapter.execute( + f""" + INSERT INTO {table_sql} + (id, col_array_simple, col_map_simple, col_struct_simple, + col_array_nested, col_map_nested, col_struct_nested, + col_array_of_struct, col_deep_nested) + VALUES ( + 1, + [1,2,3], + map{{'key1':10,'key2':20}}, + row(100,'simple'), + [[1,2],[3,4]], + map{{'arr1':[1,2],'arr2':[3,4]}}, + row(1001, ['tag1','tag2'], map{{'meta1':1,'meta2':2}}), + [row(1,'Alice'), row(2,'Bob')], + map{{'group1':[row(10,'field_a'), row(20,'field_b')]}} + ) + """ + ) + + # Verify insertion + count = fetchone_or_fail(engine_adapter, f"SELECT COUNT(*) FROM {table_sql}") + assert count[0] == 1, "Data insertion with complex nested types failed" + + # Verify data retrieval for simple types + result = fetchone_or_fail( + engine_adapter, f"SELECT col_array_simple, col_struct_simple FROM {table_sql}" + ) + assert result is not None, "Failed to retrieve complex type data" + + + class TestEndToEndModelParsing: + """ + End-to-End Model Parsing Integration Tests + + These tests verify the BASIC and COMPLETE pipeline from MODEL definition to SQL execution. + They also cover important edge cases to confirm that the whole process works: + + MODEL Definition (String) + ↓ + d.parse() + load_sql_based_model() + ↓ + Model Object (with physical_properties, partitioned_by_, clustered_by, etc.)
+ ↓ + adapter.create_table( + partitioned_by=model.partitioned_by_, # MODEL-level parameter + clustered_by=model.clustered_by, # MODEL-level parameter + table_properties=model.physical_properties # From physical_properties block + ) + ↓ + SQL Generation + ↓ + Execute on Real StarRocks + ↓ + Verify via SHOW CREATE TABLE (with ACTUAL column names) + + This ensures that the parameter forms passed to create_table() match + what SQLMesh actually produces when parsing a .sql model file. + + Test Categories: + ================ + + 1. Physical Properties Tests (properties inside physical_properties block): + 2. Model-Level Parameter Tests (parameters at MODEL level, not in physical_properties): + + Property Test Matrix (End-to-End): + +------------------+----------------------------------------+----------------------------------------+ + | Property | MODEL Syntax | Expected DDL | + +------------------+----------------------------------------+----------------------------------------+ + | primary_key | primary_key = (order_id, event_date) | PRIMARY KEY (order_id, event_date) | + | duplicate_key | duplicate_key = (id, name) | DUPLICATE KEY (id, name) | + | partitioned_by | partitioned_by (event_date) | PARTITION BY RANGE (event_date) | + | distributed_by | distributed_by = (kind='HASH', ...) | DISTRIBUTED BY HASH (id) BUCKETS N | + | clustered_by | clustered_by (order_id, region) | ORDER BY (order_id, region) | + | order_by | order_by = (dt, region) | ORDER BY (dt, region) | + | replication_num | replication_num = '1' | PROPERTIES ('replication_num'='1') | + +------------------+----------------------------------------+----------------------------------------+ + """ + + def _parse_model_and_get_all_params(self, model_sql: str) -> t.Dict[str, t.Any]: + """ + Helper: Parse MODEL definition and extract ALL parameters. + + This method returns a dictionary containing ALL parameters that would be passed + to adapter.create_table(), matching what SQLMesh actually does when processing + a model file. This ensures tests verify the real parameter forms, not hand-crafted ones. + + Returns: + Dict containing: + - physical_properties: Dict[str, exp.Expression] from MODEL's physical_properties + - partitioned_by: List[exp.Expression] from MODEL's partitioned_by parameter + - clustered_by: List[exp.Expression] from MODEL's clustered_by parameter + - target_columns_to_types: Dict[str, exp.DataType] from MODEL's columns or query + - table_description: Optional[str] from MODEL's description + - storage_format: Optional[str] from MODEL's storage_format + """ + expressions = d.parse(model_sql, default_dialect="starrocks") + model = load_sql_based_model(expressions, dialect="starrocks") + logger.debug(f"model params: {model}") + + return { + "partitioned_by": model.partitioned_by_, + "clustered_by": model.clustered_by, + "target_columns_to_types": model.columns_to_types or {}, + "table_description": model.description, + "storage_format": model.storage_format, + "table_properties": model.physical_properties, + } + + # ======================================== + # Case 1: Model Parameters (test_design.md Case 1) + # Covers: partitioned_by (multi-expr with function), clustered_by (multi-column) + # ======================================== + + def test_e2e_model_parameters(self, starrocks_adapter: StarRocksEngineAdapter): + """ + Test Case 1: Model-level parameters (partitioned_by + clustered_by). 
+ + Covers: partitioned_by (multi-expr with function), clustered_by (multi-column) + """ + db_name = "sr_e2e_model_params_db" + table_name = f"{db_name}.sr_model_params_table" + + model_sql = """ + MODEL ( + name test.model_parameters, + kind FULL, + columns ( + ts BIGINT, + region VARCHAR(50), + order_id BIGINT, + customer_id INT + ), + partitioned_by (from_unixtime(ts), region), -- Multi-expr with function + clustered_by (order_id, customer_id) -- Multi-column + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 1 DDL:\n{ddl}") + + # Precise assertions: verify PARTITION BY RANGE with actual columns + import re + + assert "PARTITION BY " in ddl + # Note: PARTITION BY may contain function expressions like from_unixtime(ts) + # We verify the clause exists and contains expected patterns + part_match = re.search(r"PARTITION BY \s*\(([^)]+)\)", ddl) + assert part_match, "PARTITION BY clause not found" + part_cols = part_match.group(1) + # Verify function expression and column references + assert ( + # "from_unixtime" in part_cols or "ts" in part_cols + "__generated_partition_column_" in part_cols and "region" in part_cols + ), f"Expected partition expression with generated column/region, got {part_cols}" + + # Verify ORDER BY from clustered_by + order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) + assert order_match, "ORDER BY clause not found" + order_cols = order_match.group(1) + assert "order_id" in order_cols and "customer_id" in order_cols, ( + f"Expected ORDER BY (order_id, customer_id), got {order_cols}" + ) + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 2: Physical Properties Core (test_design.md Case 2) + # Covers: primary_key (tuple), distributed_by (string multi-col), order_by (tuple), generic props + # ======================================== + + def test_e2e_physical_properties_core(self, starrocks_adapter: StarRocksEngineAdapter): + """ + Test Case 2: Core physical_properties. 
+ + Covers: primary_key (tuple), distributed_by (string multi-col), order_by (tuple), generic props + """ + db_name = "sr_e2e_core_props_db" + table_name = f"{db_name}.sr_core_props_table" + + model_sql = """ + MODEL ( + name test.physical_props_core, + kind FULL, + dialect starrocks, + columns ( + order_id BIGINT, + event_date DATE, + customer_id INT, + region VARCHAR(50), + amount DECIMAL(18,2) + ), + physical_properties ( + primary_key = (order_id, event_date, customer_id, region), + distributed_by = "HASH(customer_id, region) BUCKETS 16", + order_by = (order_id, region), + -- clustered_by = (order_id, region), -- also OK + -- replication_num = '1', + bucket_size = '12345678', + enable_persistent_index = 'true' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 2 DDL:\n{ddl}") + + # Precise assertions + import re + + # Verify PRIMARY KEY with exact columns + pk_match = re.search(r"PRIMARY KEY\s*\(([^)]+)\)", ddl) + assert pk_match, "PRIMARY KEY clause not found" + assert "order_id" in pk_match.group(1) and "event_date" in pk_match.group(1) + + # Verify DISTRIBUTED BY HASH with exact columns + dist_match = re.search(r"DISTRIBUTED BY HASH\s*\(([^)]+)\)", ddl) + assert dist_match, "DISTRIBUTED BY HASH clause not found" + dist_cols = dist_match.group(1) + assert "customer_id" in dist_cols and "region" in dist_cols, ( + f"Expected HASH(customer_id, region), got HASH({dist_cols})" + ) + assert "BUCKETS 16" in ddl + + # Verify ORDER BY + order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) + assert order_match, "ORDER BY clause not found" + assert "order_id" in order_match.group(1) and "region" in order_match.group(1) + + # assert "replication_num" not in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 3: String No-Paren Auto-Wrap (test_design.md Case 3) + # Covers: primary_key = "id, dt" auto-conversion + # ======================================== + + def test_e2e_string_no_paren_auto_wrap(self, starrocks_adapter: StarRocksEngineAdapter): + """ + Test Case 3: String form without parentheses auto-wrap. 
+ + Covers: primary_key = "id, dt" auto-conversion (multi-column string) + """ + db_name = "sr_e2e_auto_wrap_db" + table_name = f"{db_name}.sr_auto_wrap_table" + + model_sql = """ + MODEL ( + name test.string_no_paren, + kind FULL, + dialect starrocks, + columns ( + order_id BIGINT, + event_date DATE + ), + physical_properties ( + primary_key = "order_id, event_date", -- No parentheses, auto-wrapped + distributed_by = 'HASH(order_id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 3 DDL:\n{ddl}") + + # Precise assertion: verify exact PRIMARY KEY columns + import re + + pk_match = re.search(r"PRIMARY KEY\s*\(([^)]+)\)", ddl) + assert pk_match, "PRIMARY KEY clause not found" + pk_clause = pk_match.group(1) + assert "order_id" in pk_clause and "event_date" in pk_clause, ( + f"Expected both order_id and event_date in PRIMARY KEY, got {pk_clause}" + ) + + # Verify distributed_by with exact columns + dist_match = re.search(r"DISTRIBUTED BY HASH\s*\(([^)]+)\)", ddl) + assert dist_match, "DISTRIBUTED BY HASH clause not found" + assert "order_id" in dist_match.group(1), ( + f"Expected HASH(order_id), got HASH({dist_match.group(1)})" + ) + assert "BUCKETS 10" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 4: Structured Distribution (test_design.md Case 4) + # Covers: kind=HASH (unquoted), kind=RANDOM + # ======================================== + + def test_e2e_distribution_structured_hash(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 4A: Structured HASH distribution with unquoted kind.""" + db_name = "sr_e2e_dist_hash_db" + table_name = f"{db_name}.sr_dist_hash_table" + + model_sql = """ + MODEL ( + name test.dist_hash_structured, + kind FULL, + dialect starrocks, + columns ( + customer_id INT, + region VARCHAR(50) + ), + physical_properties ( + distributed_by = (kind=HASH, expressions=(customer_id, region), buckets=16), + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 4A DDL:\n{ddl}") + + # Precise assertions + import re + + assert "DISTRIBUTED BY HASH" in ddl + dist_match = re.search(r"DISTRIBUTED BY HASH\s*\(([^)]+)\)", ddl) + assert dist_match, "DISTRIBUTED BY HASH clause not found" + assert "customer_id" in dist_match.group(1) and "region" in dist_match.group(1) + assert "BUCKETS 16" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_e2e_distribution_structured_random(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 4B: Structured RANDOM distribution.""" + db_name = "sr_e2e_dist_random_db" + table_name = f"{db_name}.sr_dist_random_table" + + model_sql = """ + MODEL ( + name test.dist_random_structured, + kind FULL, + dialect starrocks, + columns ( + log_id BIGINT, + event_time DATETIME, + message VARCHAR(500) + ), + physical_properties ( + distributed_by = 
(kind=RANDOM, buckets=10), + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 4B DDL:\n{ddl}") + + assert "DISTRIBUTED BY RANDOM" in ddl + assert "BUCKETS 10" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 5: Partition with RANGE (test_design.md Case 5) + # Covers: partitioned_by RANGE, partitions tuple + # ======================================== + + def test_e2e_partition_range(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 5: RANGE partition with multiple partition definitions.""" + db_name = "sr_e2e_part_range_db" + table_name = f"{db_name}.sr_part_range_table" + + model_sql = """ + MODEL ( + name test.partition_range, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + year smallint, + month smallint + ), + physical_properties ( + primary_key = (id, year, month), + partition_by = RANGE(year, month), + partitions = ( + 'PARTITION p202401 VALUES LESS THAN ("2024", "02")', + 'PARTITION p202402 VALUES LESS THAN ("2024", "03")', + 'PARTITION p202403 VALUES LESS THAN ("2024", "04")' + ), + distributed_by = 'HASH(id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 5 DDL:\n{ddl}") + + # Precise assertions + import re + + assert "PARTITION BY RANGE" in ddl + # Verify partition columns + part_match = re.search(r"PARTITION BY RANGE\s*\(([^)]+)\)", ddl) + assert part_match, "PARTITION BY RANGE clause not found" + assert "year" in part_match.group(1) and "month" in part_match.group(1) + # Verify partition definitions + assert "p202401" in ddl and "p202402" in ddl and "p202403" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 6: Partition with LIST (test_design.md Case 6) + # Covers: LIST partition with partitions values + # ======================================== + + def test_e2e_partition_list(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 6: LIST partition.""" + db_name = "sr_e2e_part_list_db" + table_name = f"{db_name}.sr_part_list_table" + + model_sql = """ + MODEL ( + name test.partition_list, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + region VARCHAR(20) + ), + physical_properties ( + primary_key = (id, region), + partition_by = LIST(region), -- can't use partitioned_by + partitions = ( + 'PARTITION p_cn VALUES IN ("cn", "tw", "hk")', + 'PARTITION p_us VALUES IN ("us", "ca")' + ), + distributed_by = 'HASH(id) BUCKETS 8', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 6 
DDL:\n{ddl}") + + # Precise assertions + import re + + assert "PARTITION BY LIST" in ddl + # Verify partition column + part_match = re.search(r"PARTITION BY LIST\s*\(([^)]+)\)", ddl) + assert part_match, "PARTITION BY LIST clause not found" + assert "region" in part_match.group(1) + # Verify partition definitions + assert "p_cn" in ddl and "p_us" in ddl + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Case 7: Other Key Types (test_design.md Case 7) + # Covers: duplicate_key, unique_key, aggregate_key + # ======================================== + + def test_e2e_key_type_duplicate(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 7A: DUPLICATE KEY.""" + db_name = "sr_e2e_dup_key_db" + table_name = f"{db_name}.sr_dup_key_table" + + model_sql = """ + MODEL ( + name test.duplicate_key_model, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + dt DATE + ), + physical_properties ( + duplicate_key = (id, dt), + distributed_by = 'HASH(id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 7A DDL:\n{ddl}") + + # Verify DUPLICATE KEY with exact columns + import re + + dup_match = re.search(r"DUPLICATE KEY\s*\(([^)]+)\)", ddl) + assert dup_match, "DUPLICATE KEY clause not found" + assert "id" in dup_match.group(1) and "dt" in dup_match.group(1), ( + f"Expected DUPLICATE KEY(id, dt), got DUPLICATE KEY({dup_match.group(1)})" + ) + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_e2e_key_type_unique(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 7B: UNIQUE KEY.""" + db_name = "sr_e2e_uniq_key_db" + table_name = f"{db_name}.sr_uniq_key_table" + + model_sql = """ + MODEL ( + name test.unique_key_model, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + dt DATE + ), + physical_properties ( + unique_key = (id, dt), + distributed_by = 'HASH(id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Case 7B DDL:\n{ddl}") + + assert "UNIQUE KEY" in ddl, "UNIQUE KEY missing" + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_e2e_key_type_aggregate(self, starrocks_adapter: StarRocksEngineAdapter): + """Test Case 7C: AGGREGATE KEY - should raise exception (unsupported).""" + db_name = "sr_e2e_agg_key_db" + table_name = f"{db_name}.sr_agg_key_table" + + model_sql = """ + MODEL ( + name test.aggregate_key_model, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + dt DATE + ), + physical_properties ( + aggregate_key = (id, dt), + distributed_by = 'HASH(id) BUCKETS 10', + replication_num = '1' + ) + ); + SELECT * + """ + + from sqlmesh.utils.errors import SQLMeshError + import pytest + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + + # Expect 
SQLMeshError to be raised for unsupported AGGREGATE KEY + with pytest.raises(SQLMeshError, match="AGGREGATE KEY.*not supported"): + starrocks_adapter.create_table(table_name, **params) + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Final: Comprehensive Test (all properties combined) + # ======================================== + + def test_e2e_comprehensive(self, starrocks_adapter: StarRocksEngineAdapter): + """Final: Comprehensive test with ALL property types combined.""" + db_name = "sr_e2e_comprehensive_db" + table_name = f"{db_name}.sr_comprehensive_table" + + model_sql = """ + MODEL ( + name test.comprehensive_model, + kind FULL, + dialect starrocks, + description 'Comprehensive test table with all properties', + columns ( + order_id BIGINT, + event_date DATE, + customer_id INT, + amount DECIMAL(18,2), + status VARCHAR(20) + ), + partitioned_by (event_date), + clustered_by (order_id, event_date), + physical_properties ( + primary_key = (order_id, event_date), + distributed_by = (kind=HASH, expressions=order_id, buckets=8), + replication_num = '1', + storage_medium = 'HDD' + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + params = self._parse_model_and_get_all_params(model_sql) + starrocks_adapter.create_table(table_name, **params) + + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Comprehensive DDL:\n{ddl}") + + # Precise assertions for all major clauses + import re + + # Verify PRIMARY KEY + pk_match = re.search(r"PRIMARY KEY\s*\(([^)]+)\)", ddl) + assert pk_match, "PRIMARY KEY clause not found" + assert "order_id" in pk_match.group(1) and "event_date" in pk_match.group(1) + + # Verify PARTITION BY + assert "PARTITION BY" in ddl + # Verify exact partition column + part_match = re.search(r"PARTITION BY[^(]*\(([^)]+)\)", ddl) + assert part_match, "PARTITION BY clause not found" + part_cols = part_match.group(1) + assert "event_date" in part_cols, ( + f"Expected event_date in PARTITION BY, got {part_cols}" + ) + + # Verify DISTRIBUTED BY + assert "DISTRIBUTED BY HASH" in ddl + assert "BUCKETS 8" in ddl + + # Verify ORDER BY + order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) + assert order_match, "ORDER BY clause not found" + assert "order_id" in order_match.group(1) and "event_date" in order_match.group(1) + + # Verify PROPERTIES + assert "replication_num" in ddl + + # Functional test + starrocks_adapter.execute( + f"INSERT INTO {table_name} " + f"(order_id, event_date, customer_id, amount, status) " + f"VALUES (1001, '2024-01-15', 100, 1234.56, 'completed')" + ) + + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT order_id, customer_id FROM {table_name} WHERE order_id = 1001", + ) + assert result, "INSERT/SELECT failed" + assert result[0] == 1001, "order_id mismatch" + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ======================================== + # Quote Character Handling Test + # Tests single quotes vs double quotes in MODEL parsing + # ======================================== + + def test_e2e_quote_character_handling(self, starrocks_adapter: StarRocksEngineAdapter): + """ + Test Case: Quote Character Handling (Single vs Double Quotes). 
+ + This test verifies that MODEL parsing correctly handles different quote types: + - Single quotes 'value' → Literal(is_string=True) ✓ + - Double quotes "value" → Column(quoted=True) (parser quirk, but we handle it) ✓ + - Bare identifiers → proper parsing + + We test this by using different quote forms in MODEL physical_properties + and verifying that the final DDL is correct. + + Quote Behavior: + =============== + In MySQL/StarRocks: + - Backtick ` : identifier quote + - Single quote ': string literal + - Double quote ": string literal (default) OR identifier (ANSI_QUOTES mode) + + In SQLMesh MODEL parsing: + - Single quotes 'value' → exp.Literal (correct) + - Double quotes "value" → exp.Column(quoted=True) (inconsistent with SQL, but handled) + + This test ensures our workaround in ensure_parenthesized() works correctly. + """ + db_name = "sr_e2e_quote_handling_db" + table_name = f"{db_name}.sr_quote_test_table" + + # Test with different quote forms in MODEL + model_sql = """ + MODEL ( + name test.quote_handling_model, + kind FULL, + dialect starrocks, + columns ( + id BIGINT, + dt DATE, + region VARCHAR(50), + customer_id INT + ), + physical_properties ( + -- Single quotes (correct way) - parses to Literal + primary_key = 'id, dt, region', + + partition_by = "date_trunc('day', dt), region", + + -- Double quotes (parser quirk) - parses to Column(quoted=True) + -- But our ensure_parenthesized handles this + order_by = "id, region", + + -- Structured form with single-quoted string + distributed_by = 'HASH(id) BUCKETS 8', + + -- Generic properties with single quotes + replication_num = '1', + -- storage_medium = "HDD" -- not valid in shared-data cluster + ) + ); + SELECT * + """ + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + # Parse MODEL and extract parameters (this is where quote handling happens) + params = self._parse_model_and_get_all_params(model_sql) + + # Log parsed parameters for debugging + logger.info(f"Parsed physical_properties: {params['table_properties']}") + for key, value in params["table_properties"].items(): + logger.info(f" {key}: {type(value).__name__} = {value}") + + # Create table with parsed parameters + starrocks_adapter.create_table(table_name, **params) + + # Verify via SHOW CREATE TABLE + show_create = fetchone_or_fail(starrocks_adapter, f"SHOW CREATE TABLE {table_name}") + ddl = show_create[1] + logger.info(f"Quote Handling Test DDL:\n{ddl}") + + # Precise assertions + import re + + # 1. Verify PRIMARY KEY (from single-quoted string 'id, dt') + pk_match = re.search(r"PRIMARY KEY\s*\(([^)]+)\)", ddl) + assert pk_match, "PRIMARY KEY clause not found" + pk_cols = pk_match.group(1) + assert "id" in pk_cols and "dt" in pk_cols, ( + f"Expected PRIMARY KEY (id, dt), got {pk_cols}. " + f"Single-quoted string 'id, dt' was not correctly parsed!" + ) + + # 2. Verify ORDER BY (from double-quoted string \"id, region\") + # This tests our Column(quoted=True) workaround + order_match = re.search(r"ORDER BY\s*\(([^)]+)\)", ddl) + assert order_match, "ORDER BY clause not found" + order_cols = order_match.group(1) + assert "id" in order_cols and "region" in order_cols, ( + f"Expected ORDER BY (id, region), got {order_cols}. " + f'Double-quoted string "id, region" was not correctly handled!' + ) + + # 3. 
Verify DISTRIBUTED BY (from single-quoted string) + assert "DISTRIBUTED BY HASH" in ddl, "DISTRIBUTED BY clause not found" + assert "customer_id" in ddl, "customer_id not found in DISTRIBUTED BY" + assert "BUCKETS 8" in ddl, "BUCKETS not found in DISTRIBUTED BY" + + # 4. Verify PROPERTIES (generic properties with single quotes) + assert "replication_num" in ddl, "replication_num not found in PROPERTIES" + # assert "storage_medium" in ddl or "HDD" in ddl, "storage_medium not found in PROPERTIES" + + # Functional test: Verify table actually works + starrocks_adapter.execute( + f"INSERT INTO {table_name} " + f"(id, dt, region, customer_id) " + f"VALUES (100, '2024-01-01', 'US', 1001)" + ) + + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT id, region, customer_id FROM {table_name} WHERE id = 100", + ) + assert result, "INSERT/SELECT failed" + assert result == (100, "US", 1001), f"Data mismatch: {result}" + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + +# ==================== StarRocks Native SQL Capabilities ==================== + + +class TestStarRocksAbility: + """ + Test StarRocks native SQL capabilities and limitations. + + This test class validates StarRocks database features by executing + raw SQL statements directly, without going through SQLMesh abstraction layers. + + Purpose: + - Document which SQL features are supported + - Verify expected failures for unsupported operations + - Guide adapter implementation decisions + + Note: Tests marked with @pytest.mark.xfail are EXPECTED to fail. + """ + + @pytest.fixture(scope="class") + def test_tables( + self, starrocks_adapter: StarRocksEngineAdapter + ) -> t.Generator[t.Dict[str, str], None, None]: + """ + Pre-create tables of different types for testing. + + Returns: + Dict mapping table type to fully qualified table name + """ + db_name = "sr_ability_test" + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + tables = {} + + # 1. PRIMARY KEY table + # Note: StarRocks PRIMARY KEY tables support complex DELETE operations (BETWEEN, subqueries, etc.) + pk_table = f"{db_name}.pk_table" + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {pk_table} ( + id INT, + dt DATE, + name STRING, + status STRING + ) PRIMARY KEY (id, dt) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + # Verify table creation + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT COUNT(*) FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'pk_table'", + ) + assert result[0] == 1, f"PRIMARY KEY table {pk_table} creation failed" + tables["primary_key"] = pk_table + + # 2. DUPLICATE KEY table + dup_table = f"{db_name}.dup_table" + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {dup_table} ( + id INT, + dt DATE, + name STRING, + status STRING + ) DUPLICATE KEY (id, dt) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + # Verify table creation + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT COUNT(*) FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'dup_table'", + ) + assert result[0] == 1, f"DUPLICATE KEY table {dup_table} creation failed" + tables["duplicate_key"] = dup_table + + # 3. 
UNIQUE KEY table + unique_table = f"{db_name}.unique_table" + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {unique_table} ( + id INT, + dt DATE, + name STRING, + status STRING + ) UNIQUE KEY (id, dt) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + # Verify table creation + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT COUNT(*) FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'unique_table'", + ) + assert result[0] == 1, f"UNIQUE KEY table {unique_table} creation failed" + tables["unique_key"] = unique_table + + yield tables + + # Cleanup + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ==================== Schema Operations ==================== + + @pytest.mark.parametrize("sql_keyword", ["SCHEMA", "DATABASE"]) + def test_create_drop_keyword_support( + self, starrocks_adapter: StarRocksEngineAdapter, sql_keyword: str + ): + """ + Test both CREATE SCHEMA and CREATE DATABASE syntax. + + Expected: Both keywords should work (they are synonyms in StarRocks) + """ + test_name = f"sr_ability_{sql_keyword.lower()}" + + try: + # CREATE + starrocks_adapter.execute(f"CREATE {sql_keyword} IF NOT EXISTS {test_name}") + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{test_name}'", + ) + assert result, f"CREATE {sql_keyword} failed" + + # DROP + starrocks_adapter.execute(f"DROP {sql_keyword} IF EXISTS {test_name}") + result_optional: t.Optional[Row] = starrocks_adapter.fetchone( + f"SELECT SCHEMA_NAME FROM information_schema.SCHEMATA WHERE SCHEMA_NAME = '{test_name}'" + ) + assert result_optional is None, f"DROP {sql_keyword} failed" + + finally: + starrocks_adapter.execute(f"DROP {sql_keyword} IF EXISTS {test_name}") + + # ==================== DML Capabilities ==================== + + def test_insert_select_supported(self, starrocks_adapter: StarRocksEngineAdapter): + """Basic INSERT/SELECT support (raw SQL capability check).""" + db_name = "sr_ability_insert_select" + table_name = f"{db_name}.t" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {table_name} ( + id INT, + name VARCHAR(100) + ) PRIMARY KEY (id) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + starrocks_adapter.execute( + f"INSERT INTO {table_name} (id, name) VALUES (1, 'Alice'), (2, 'Bob')" + ) + rows = starrocks_adapter.fetchall(f"SELECT id, name FROM {table_name} ORDER BY id") + assert list(rows) == [(1, "Alice"), (2, "Bob")], f"Data mismatch: {rows}" + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_update_supported(self, starrocks_adapter: StarRocksEngineAdapter): + """Basic UPDATE support (raw SQL capability check).""" + db_name = "sr_ability_update" + table_name = f"{db_name}.t" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE IF NOT EXISTS {table_name} ( + id INT, + name VARCHAR(100) + ) PRIMARY KEY (id) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + starrocks_adapter.execute(f"INSERT INTO {table_name} (id, name) VALUES (1, 'Alice')") + starrocks_adapter.execute( + f"UPDATE {table_name} SET name = 'Alice Updated' WHERE id = 1" + ) + result = fetchone_or_fail( + starrocks_adapter, f"SELECT name FROM {table_name} WHERE id = 1" + ) + assert result == ("Alice Updated",), f"UPDATE failed: {result}" + finally: + starrocks_adapter.drop_schema(db_name, 
ignore_if_not_exists=True) + + # ==================== DELETE Operations - Success Cases ==================== + + @pytest.mark.parametrize( + "table_type,delete_clause,expected_remaining", + [ + # PRIMARY KEY table - full support + ("primary_key", "WHERE id = 1", 2), + ("primary_key", "WHERE dt BETWEEN '2024-01-01' AND '2024-06-30'", 1), + ( + "primary_key", + "WHERE id IN (SELECT id FROM {table} WHERE status = 'deleted')", + 2, + ), + ("primary_key", "WHERE TRUE", 0), + # PRIMARY KEY with USING (JOIN delete) + ( + "primary_key", + "USING {table} t2 WHERE {table}.id = t2.id AND t2.status = 'deleted'", + 2, + ), + # DUPLICATE/UNIQUE KEY - only simple WHERE + ("duplicate_key", "WHERE id = 1", 2), + ("unique_key", "WHERE id = 1", 2), + ], + ids=[ + "pk_simple_where", + "pk_between", + "pk_subquery", + "pk_where_true", + "pk_using_join", + "dup_simple_where", + "unique_simple_where", + ], + ) + def test_delete_supported_syntax( + self, + starrocks_adapter: StarRocksEngineAdapter, + test_tables: t.Dict[str, str], + table_type: str, + delete_clause: str, + expected_remaining: int, + ): + """ + Test DELETE operations that should succeed. + + Expected: DELETE succeeds and leaves expected number of rows + """ + table_name = test_tables[table_type] + + # Prepare test data for this specific test (better isolation) + # All tables have the same column structure: (id, dt, name, status) + test_data = """ + (1, '2024-01-15', 'Alice', 'active'), + (2, '2024-06-10', 'Bob', 'deleted'), + (3, '2024-12-05', 'Charlie', 'active') + """ + starrocks_adapter.execute(f"TRUNCATE TABLE {table_name}") + starrocks_adapter.execute(f"INSERT INTO {table_name} VALUES {test_data}") + + # Format delete clause (for subquery/using with table reference) + delete_sql = f"DELETE FROM {table_name} {delete_clause.format(table=table_name)}" + + # Debug: Log the SQL before execution + logger.info(f"Executing DELETE SQL: {delete_sql}") + + # Execute delete + starrocks_adapter.execute(delete_sql) + + # Verify result + count = fetchone_or_fail(starrocks_adapter, f"SELECT COUNT(*) FROM {table_name}")[0] + logger.info(f"After DELETE: {count} rows remaining (expected {expected_remaining})") + assert count == expected_remaining, ( + f"Expected {expected_remaining} rows, got {count} for {table_type} with {delete_clause}" + ) + + # ==================== DELETE Operations - Failure Cases ==================== + + syntax_error = "not supported|syntax error|getting analyzing error" + + @pytest.mark.parametrize( + "table_type,delete_clause,error_pattern", + [ + # DUPLICATE KEY - unsupported syntax + ( + "duplicate_key", + "WHERE dt BETWEEN '2024-01-01' AND '2024-12-31'", + syntax_error, + ), + ( + "duplicate_key", + "WHERE id IN (SELECT id FROM {table} WHERE status = 'deleted')", + syntax_error, + ), + ("duplicate_key", "WHERE TRUE", syntax_error), + # UNIQUE KEY - unsupported syntax + ( + "unique_key", + "WHERE dt BETWEEN '2024-01-01' AND '2024-12-31'", + syntax_error, + ), + ("unique_key", "WHERE id IN (SELECT id FROM {table})", syntax_error), + ], + ids=[ + "dup_between_unsupported", + "dup_subquery_unsupported", + "dup_where_true_unsupported", + "unique_between_unsupported", + "unique_subquery_unsupported", + ], + ) + def test_delete_unsupported_syntax( + self, + starrocks_adapter: StarRocksEngineAdapter, + test_tables: t.Dict[str, str], + table_type: str, + delete_clause: str, + error_pattern: str, + ): + """ + Test DELETE operations that should fail on non-PRIMARY KEY tables. + + Expected: DELETE fails with specific error message. 
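+ + For illustration only (a sketch, not an extra parametrized case), the DUPLICATE KEY rows exercise statements such as: + + starrocks_adapter.execute( + "DELETE FROM sr_ability_test.dup_table " + "WHERE dt BETWEEN '2024-01-01' AND '2024-12-31'" + ) + + which StarRocks is expected to reject, while the equivalent DELETE succeeds on the PRIMARY KEY table (see test_delete_supported_syntax above).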
+ """ + table_name = test_tables[table_type] + delete_sql = f"DELETE FROM {table_name} {delete_clause.format(table=table_name)}" + + # This should raise an exception + with pytest.raises(Exception) as exc_info: + starrocks_adapter.execute(delete_sql) + + # Verify error message matches expected pattern + import re + + error_msg = str(exc_info.value).lower() + assert re.search(error_pattern, error_msg), ( + f"Expected error pattern '{error_pattern}', got: {exc_info.value}" + ) + + # ==================== COMMENT Syntax Tests ==================== + + @pytest.mark.parametrize( + "comment_type,sql_template", + [ + # Table comment variants + ("table_standard", "ALTER TABLE {table} COMMENT = '{comment}'"), + # Failed without `=` + # ("table_standard", "ALTER TABLE {table} COMMENT '{comment}'"), + # No MODIFY keyworkd + # ("table_modify", 'ALTER TABLE {table} MODIFY COMMENT "{comment}"'), + # # Column comment variants + ( + "column_no_type", + "ALTER TABLE {table} MODIFY COLUMN {column} COMMENT '{comment}'", + ), + # it will take some time to change the column type + # ("column_with_type", "ALTER TABLE {table} MODIFY COLUMN {column} BIGINT COMMENT '{comment}'"), + ], + ids=[ + "table_comment_standard", + # "table_comment_standard_without_equal", # FAIL + # "table_comment_modify", # FAIL + "column_comment_no_type", + # "column_comment_with_type", + ], + ) + def test_comment_syntax_variants( + self, + starrocks_adapter: StarRocksEngineAdapter, + comment_type: str, + sql_template: str, + ): + """ + Test different COMMENT syntax variations to determine StarRocks support. + + Purpose: Guide whether we need to override comment methods in adapter + """ + db_name = "sr_ability_comment" + table_name = f"{db_name}.test_comment" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} ( + id INT, + col1 INT + ) + DUPLICATE KEY (id) -- key columns can't be changed. 
+ DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Generate SQL based on template + if "table" in comment_type: + sql = sql_template.format(table=table_name, comment=f"test {comment_type}") + else: # column + sql = sql_template.format( + table=table_name, column="col1", comment=f"test {comment_type}" + ) + + # Try to execute + try: + starrocks_adapter.execute(sql) + + # Verify comment was set + if "table" in comment_type: + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT TABLE_COMMENT FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_comment'", + )[0] + assert f"test {comment_type}" in result, ( + f"Comment not set correctly for {comment_type}" + ) + else: # column + result_row = fetchone_or_fail( + starrocks_adapter, + f"SELECT COLUMN_NAME, COLUMN_COMMENT FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_comment' " + f"AND COLUMN_NAME = 'col1'", + ) + logger.info(f"Column comment: {result_row}") + result = result_row[1] + assert f"test {comment_type}" in result, ( + f"Comment not set correctly for {comment_type}" + ) + + logger.info(f"✅ {comment_type}: SUPPORTED") + + except Exception as e: + logger.warning(f"❌ {comment_type}: NOT SUPPORTED - {e}") + # Re-raise for test failure + raise + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + # ==================== Quote Type Tests ==================== + + @pytest.mark.parametrize( + "quote_type,comment_value", + [ + ("single", "single quotes"), + ("double", "double quotes"), + ("escaped_single", "It\\'s a test"), + ("escaped_double", 'Say \\"hello\\"'), + ], + ids=["single_quotes", "double_quotes", "escaped_single", "escaped_double"], + ) + def test_comment_quote_types( + self, + starrocks_adapter: StarRocksEngineAdapter, + quote_type: str, + comment_value: str, + ): + """ + Test different quote types in COMMENT clauses. + + Purpose: Determine which quote types StarRocks accepts + """ + db_name = "sr_ability_quotes" + table_name = f"{db_name}.test_quotes" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} (id INT) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Build SQL with appropriate quotes + if "single" in quote_type: + sql = f"ALTER TABLE {table_name} COMMENT = '{comment_value}'" + else: # double + sql = f'ALTER TABLE {table_name} COMMENT = "{comment_value}"' + + starrocks_adapter.execute(sql) + logger.info(f"✅ {quote_type}: SUPPORTED") + + except Exception as e: + logger.warning(f"❌ {quote_type}: NOT SUPPORTED - {e}") + raise + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_comment_in_create_table(self, starrocks_adapter: StarRocksEngineAdapter): + """ + Test COMMENT clauses in CREATE TABLE statement. 
+ + Expected: Verify comments are registered during table creation + """ + db_name = "sr_ability_create_comment" + table_name = f"{db_name}.test_create_comment" + + try: + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + + # Create table with comments + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} ( + id INT COMMENT 'id column', + name VARCHAR(100) COMMENT 'name column' + ) + PRIMARY KEY (id) + COMMENT 'test table' + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Verify table comment + table_comment = fetchone_or_fail( + starrocks_adapter, + f"SELECT TABLE_COMMENT FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_create_comment'", + )[0] + assert table_comment == "test table", f"Table comment mismatch: {table_comment}" + + # Verify column comments + column_comments = {} + results = starrocks_adapter.fetchall( + f"SELECT COLUMN_NAME, COLUMN_COMMENT FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_create_comment'" + ) + for col_name, col_comment in results: + if col_comment: # Skip empty comments + column_comments[col_name] = col_comment + + assert column_comments.get("id") == "id column", ( + f"Column comment mismatch: {column_comments}" + ) + assert column_comments.get("name") == "name column", ( + f"Column comment mismatch: {column_comments}" + ) + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + +class TestCommentMethods: + """ + Test _build_create_comment_table_exp and _build_create_comment_column_exp methods. + + These methods are used to generate ALTER TABLE SQL for modifying comments. + Although StarRocks uses COMMENT_CREATION_TABLE = IN_SCHEMA_DEF_CTAS (comments + are included in CREATE TABLE), these methods may be used for: + - Modifying existing table comments + - View comments (depending on COMMENT_CREATION_VIEW) + - Future ALTER TABLE support + """ + + def test_build_create_comment_table_exp(self, starrocks_adapter: StarRocksEngineAdapter): + """ + Test _build_create_comment_table_exp generates correct ALTER TABLE COMMENT SQL. + + Verifies: + 1. Method generates correct SQL syntax + 2. SQL can be executed successfully + 3. 
Comment is actually updated in database + """ + db_name = "sr_test_comment_table" + table_name = f"{db_name}.test_table" + + try: + # Setup: Create schema and table + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} ( + id INT, + name VARCHAR(100) + ) + PRIMARY KEY (id) + COMMENT 'initial comment' + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Test: Use _build_create_comment_table_exp to generate SQL + table_expr = exp.to_table(table_name) + new_comment = "Updated table comment via method" + comment_sql = starrocks_adapter._build_create_comment_table_exp( + table=table_expr, table_comment=new_comment, table_kind="TABLE" + ) + + # Verify: SQL format is correct + assert "ALTER TABLE" in comment_sql, f"Invalid SQL format: {comment_sql}" + assert "COMMENT =" in comment_sql, f"Missing COMMENT = in SQL: {comment_sql}" + assert new_comment in comment_sql, f"Comment not in SQL: {comment_sql}" + + # Execute the generated SQL + starrocks_adapter.execute(comment_sql) + + # Verify: Comment was actually updated + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT TABLE_COMMENT FROM information_schema.TABLES " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_table'", + ) + assert result, "Table not found after comment update" + assert result[0] == new_comment, ( + f"Comment not updated. Expected: {new_comment}, Got: {result[0]}" + ) + + logger.info("✅ _build_create_comment_table_exp generates valid SQL") + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) + + def test_build_create_comment_column_exp(self, starrocks_adapter: StarRocksEngineAdapter): + """ + Test _build_create_comment_column_exp generates correct ALTER TABLE MODIFY COLUMN SQL. + + Verifies: + 1. Method generates correct SQL with column type + 2. SQL can be executed successfully + 3. Column comment is actually updated in database + 4. 
Column type is preserved (not changed) + """ + db_name = "sr_test_comment_column" + table_name = f"{db_name}.test_table" + + try: + # Setup: Create schema and table + starrocks_adapter.create_schema(db_name, ignore_if_exists=True) + starrocks_adapter.execute( + f""" + CREATE TABLE {table_name} ( + id INT COMMENT 'initial id comment', + name VARCHAR(100) COMMENT 'initial name comment', + amount DECIMAL(10, 2) + ) + PRIMARY KEY (id) + DISTRIBUTED BY HASH(id) BUCKETS 10 + """ + ) + + # Test: Use _build_create_comment_column_exp to generate SQL + table_expr = exp.to_table(table_name) + new_comment = "Updated column comment via method" + comment_sql = starrocks_adapter._build_create_comment_column_exp( + table=table_expr, + column_name="name", + column_comment=new_comment, + table_kind="TABLE", + ) + + # Verify: SQL format is correct + assert "ALTER TABLE" in comment_sql, f"Invalid SQL format: {comment_sql}" + assert "MODIFY COLUMN" in comment_sql, f"Missing MODIFY COLUMN in SQL: {comment_sql}" + assert "COMMENT" in comment_sql, f"Missing COMMENT in SQL: {comment_sql}" + assert new_comment in comment_sql, f"Comment not in SQL: {comment_sql}" + + # Execute the generated SQL + starrocks_adapter.execute(comment_sql) + + # Verify: Column comment was actually updated + result = fetchone_or_fail( + starrocks_adapter, + f"SELECT COLUMN_TYPE, COLUMN_COMMENT FROM information_schema.COLUMNS " + f"WHERE TABLE_SCHEMA = '{db_name}' AND TABLE_NAME = 'test_table' AND COLUMN_NAME = 'name'", + ) + assert result is not None, "Column not found after comment update" + column_type, column_comment = result + assert column_comment == new_comment, ( + f"Comment not updated. Expected: {new_comment}, Got: {column_comment}" + ) + assert "varchar(100)" in column_type.lower(), ( + f"Column type changed unexpectedly: {column_type}" + ) + + logger.info("✅ _build_create_comment_column_exp generates valid SQL with correct type") + + finally: + starrocks_adapter.drop_schema(db_name, ignore_if_not_exists=True) diff --git a/tests/core/engine_adapter/test_starrocks.py b/tests/core/engine_adapter/test_starrocks.py new file mode 100644 index 0000000000..46f9b5be08 --- /dev/null +++ b/tests/core/engine_adapter/test_starrocks.py @@ -0,0 +1,1814 @@ +"""Tests for StarRocks Engine Adapter + +This test suite covers the StarRocks-specific functionality of the engine adapter, +including schema operations, table operations, and StarRocks-specific table properties. + +Test classes are organized by functionality (following the standard order): +- TestSchemaOperations: Schema/Database operations +- TestTableOperations: Basic table operations +- TestKeyPropertyBuilding: Table key types (primary_key, duplicate_key, unique_key, aggregate_key) +- TestPartitionPropertyBuilding: Partition (partitioned_by, partitions) +- TestDistributionPropertyBuilding: Distribution (distributed_by) +- TestOrderByPropertyBuilding: Order By (order_by, clustered_by) +- TestCommentPropertyBuilding: Comments (table and column) +- TestGenericPropertyBuilding: Generic properties (replication_num, etc.) +- TestComprehensive: Comprehensive tests with all features combined + +Unit tests use @pytest.mark.parametrize to systematically cover all value forms. 
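+ +A typical unit-test pattern (a sketch of the approach used throughout this file, not an additional test): mock the adapter, call a method, and assert on the SQL it would have executed, e.g. + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_schema("test_schema") + assert to_sql_calls(adapter) == ["CREATE SCHEMA IF NOT EXISTS `test_schema`"] + +No real StarRocks connection is required here; only the generated SQL is checked.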
+""" + +import typing as t + +import pytest +from sqlglot import expressions as exp +from sqlglot import parse_one +from pytest_mock.plugin import MockerFixture +from sqlmesh.core.engine_adapter.shared import DataObjectType +from sqlmesh.utils.errors import SQLMeshError + +from tests.core.engine_adapter import to_sql_calls +from sqlmesh.core.engine_adapter.starrocks import StarRocksEngineAdapter +from sqlmesh.core.dialect import parse +from sqlmesh.core.model import load_sql_based_model, SqlModel + +pytestmark = [pytest.mark.starrocks, pytest.mark.engine] + + +def _load_sql_model(model_sql: str) -> SqlModel: + """Parse StarRocks MODEL SQL into a SqlModel instance.""" + expressions = parse(model_sql, default_dialect="starrocks") + return t.cast(SqlModel, load_sql_based_model(expressions)) + + +def _columns(model: SqlModel) -> t.Dict[str, exp.DataType]: + assert model.columns_to_types is not None + return model.columns_to_types + + +# ============================================================================= +# Schema Operations +# ============================================================================= +class TestSchemaOperations: + """Tests for schema (database) operations.""" + + def test_create_schema( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE DATABASE statement generation. + + StarRocks uses DATABASE keyword (MySQL-style) instead of SCHEMA. + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_schema("test_schema") + + assert to_sql_calls(adapter) == [ + "CREATE SCHEMA IF NOT EXISTS `test_schema`", + ] + + def test_create_schema_without_if_exists( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE DATABASE without IF NOT EXISTS clause.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_schema("test_schema", ignore_if_exists=False) + + assert to_sql_calls(adapter) == [ + "CREATE SCHEMA `test_schema`", + ] + + def test_drop_schema(self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]): + """Test DROP DATABASE statement generation.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.drop_schema("test_schema") + adapter.drop_schema("test_schema", ignore_if_not_exists=False) + + assert to_sql_calls(adapter) == [ + "DROP SCHEMA IF EXISTS `test_schema`", + "DROP SCHEMA `test_schema`", + ] + + +# ============================================================================= +# Data Object Query (MV vs VIEW) +# ============================================================================= +class TestDataObjectQuery: + def test_get_data_object_materialized_view_is_distinguished_from_view( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + mocker: MockerFixture, + ) -> None: + """ + StarRocks may report materialized views as TABLE_TYPE='VIEW' in information_schema.tables. + Ensure StarRocksEngineAdapter upgrades MV objects using information_schema.materialized_views. 
+ """ + import pandas as pd + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter, patch_get_data_objects=False) + + # information_schema.tables output (MV appears as 'view') + # fetchdf is called twice: + # 1) information_schema.tables + # 2) information_schema.materialized_views + tables_df = pd.DataFrame( + [ + {"schema_name": "test_db", "name": "mv1", "type": "view"}, + {"schema_name": "test_db", "name": "mv2", "type": "view"}, + {"schema_name": "test_db", "name": "v1", "type": "view"}, + {"schema_name": "test_db", "name": "t1", "type": "table"}, + ] + ) + mv_df = pd.DataFrame( + [ + {"schema_name": "test_db", "name": "mv1"}, + {"schema_name": "test_db", "name": "mv2"}, + ] + ) + + known_names = ["mv1", "mv2", "v1", "t1"] + + def fetchdf_side_effect(query: exp.Expression, *_: t.Any, **__: t.Any): + query_sql = query.sql(dialect="starrocks").lower() + requested = [ + name for name in known_names if f"'{name}'" in query_sql or f"`{name}`" in query_sql + ] + if "information_schema.materialized_views" in query_sql: + df = mv_df + else: + df = tables_df + if requested: + mask = df["name"].str.lower().isin(requested) + return df[mask].reset_index(drop=True) + return df.reset_index(drop=True) + + adapter.fetchdf = mocker.Mock(side_effect=fetchdf_side_effect) # type: ignore[assignment] + + mv1 = adapter.get_data_object("test_db.mv1") + assert mv1 is not None + assert mv1.type == DataObjectType.MATERIALIZED_VIEW + + v1 = adapter.get_data_object("test_db.v1") + assert v1 is not None + assert v1.type == DataObjectType.VIEW + + mv2_objects = adapter.get_data_objects(schema_name="test_db", object_names={"mv2"}) + assert len(mv2_objects) == 1 + assert mv2_objects[0].name.lower() == "mv2" + assert mv2_objects[0].type == DataObjectType.MATERIALIZED_VIEW + + +# ============================================================================= +# Basic Table Operations +# ============================================================================= +class TestTableOperations: + """Tests for basic table operations.""" + + def test_create_table( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test basic CREATE TABLE statement generation.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + "test_table", + target_columns_to_types={ + "a": exp.DataType.build("INT"), + "b": exp.DataType.build("VARCHAR(100)"), + }, + ) + + sql = to_sql_calls(adapter)[0] + assert "CREATE TABLE IF NOT EXISTS `test_table`" in sql + assert "`a` INT" in sql + assert "`b` VARCHAR(100)" in sql + + def test_create_table_like( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE TABLE LIKE statement.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table_like("target_table", "source_table") + assert to_sql_calls(adapter) == [ + "CREATE TABLE IF NOT EXISTS `target_table` LIKE `source_table`", + ] + + def test_create_table_like_exists_false( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE TABLE LIKE with exists=False (no IF NOT EXISTS).""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table_like("target_table", "source_table", exists=False) + assert to_sql_calls(adapter) == [ + "CREATE TABLE `target_table` LIKE `source_table`", + ] + + def test_create_table_like_qualified_names( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE TABLE LIKE with 
database-qualified names.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table_like("db.target_table", "db.source_table") + assert to_sql_calls(adapter) == [ + "CREATE TABLE IF NOT EXISTS `db`.`target_table` LIKE `db`.`source_table`", + ] + + def test_create_table_like_does_not_call_columns( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + mocker: MockerFixture, + ): + """ + StarRocks overrides _create_table_like to use native CREATE TABLE LIKE and should + not fall back to the base implementation (which calls columns(source)). + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + columns_mock = mocker.patch.object( + adapter, "columns", side_effect=AssertionError("columns() should not be called") + ) + + adapter.create_table_like("target_table", "source_table") + assert columns_mock.call_count == 0 + + def test_create_table_like_clears_cache( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + mocker: MockerFixture, + ): + """create_table_like should clear the data object cache for the target table.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + clear_cache = mocker.patch.object(adapter, "_clear_data_object_cache") + + adapter.create_table_like("target_table", "source_table") + clear_cache.assert_called_once_with("target_table") + + def test_rename_table( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test RENAME TABLE statement.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + # Test 1: Simple table names (no database qualifier) + adapter.rename_table("old_table", "new_table") + adapter.cursor.execute.assert_called_with("ALTER TABLE `old_table` RENAME `new_table`") + + # Test 2: Database-qualified names - RENAME only uses table name + adapter.cursor.execute.reset_mock() + adapter.rename_table("db.old_table", "db.new_table") + # StarRocks RENAME clause requires unqualified table name + adapter.cursor.execute.assert_called_with("ALTER TABLE `db`.`old_table` RENAME `new_table`") + + def test_delete_from(self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]): + """Test DELETE statement generation.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.delete_from(exp.to_table("test_table"), "id = 1") + + assert to_sql_calls(adapter) == [ + "DELETE FROM `test_table` WHERE `id` = 1", + ] + + def test_create_index( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE INDEX statement - StarRocks doesn't support standalone indexes.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_index("test_table", "idx_name", ("cola",)) + + # StarRocks skips index creation - verify no execute call was made + adapter.cursor.execute.assert_not_called() + + def test_create_view(self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]): + """Test CREATE VIEW statement generation.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view("test_view", parse_one("SELECT a FROM tbl")) + adapter.create_view("test_view", parse_one("SELECT a FROM tbl"), replace=False) + + assert to_sql_calls(adapter) == [ + "CREATE OR REPLACE VIEW `test_view` AS SELECT `a` FROM `tbl`", + "CREATE VIEW `test_view` AS SELECT `a` FROM `tbl`", + ] + + def test_create_view_with_security( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE VIEW 
with StarRocks SECURITY property.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view( + "test_view", + parse_one("SELECT a FROM tbl"), + replace=False, + view_properties={"security": exp.Var(this="INVOKER")}, + ) + + sql = to_sql_calls(adapter)[0] + assert "SECURITY INVOKER" in sql + + def test_create_materialized_view_replace_with_refresh_and_comments( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE MATERIALIZED VIEW generation (drop+create, refresh, comments, schema).""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view( + "test_mv", + parse_one("SELECT a FROM tbl"), + materialized=True, + target_columns_to_types={"a": exp.DataType.build("INT")}, + table_description="Test MV description", + column_descriptions={"a": "Column A description"}, + view_properties={ + "refresh_moment": exp.Var(this="IMMEDIATE"), + "refresh_scheme": exp.Literal.string( + "ASYNC START ('2025-01-01 00:00:00') EVERY (INTERVAL 5 MINUTE)" + ), + }, + ) + + calls = to_sql_calls(adapter) + assert calls[0] == "DROP MATERIALIZED VIEW IF EXISTS `test_mv`" + assert "CREATE MATERIALIZED VIEW" in calls[1] + assert "COMMENT 'Test MV description'" in calls[1] + assert "COMMENT 'Column A description'" in calls[1] + assert "REFRESH IMMEDIATE ASYNC" in calls[1] + assert "START ('2025-01-01 00:00:00')" in calls[1] + assert "EVERY (INTERVAL 5 MINUTE)" in calls[1] + + def test_delete_where_true_optimization( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test DELETE with WHERE TRUE optimization. + + WHERE TRUE is converted to TRUNCATE TABLE for better performance. + This works for all StarRocks table types and is semantically equivalent. + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + # Test WHERE TRUE + adapter.delete_from(exp.to_table("test_table"), exp.true()) + assert to_sql_calls(adapter) == [ + "TRUNCATE TABLE `test_table`", + ] + + adapter.cursor.reset_mock() + + # Test no WHERE clause (also uses TRUNCATE) + adapter.delete_from(exp.to_table("test_table"), None) + assert to_sql_calls(adapter) == [ + "TRUNCATE TABLE `test_table`", + ] + + +# ============================================================================= +# WHERE Clause Transformations +# ============================================================================= +class TestWhereClauseTransformations: + """ + Tests for WHERE clause transformations in DELETE statements. + + StarRocks has limitations on DELETE WHERE clauses for non-PRIMARY KEY tables: + - BETWEEN is not supported → converted to >= AND <= + - Boolean literals (TRUE/FALSE) are not supported → removed or converted to 1=1/1=0 + + These transformations are applied conservatively to all DELETE statements since + table type cannot be easily determined at DELETE time. + """ + + def test_delete_with_between_simple( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test BETWEEN is converted to >= AND <= in DELETE WHERE. + + StarRocks Limitation: + BETWEEN is not supported in DELETE WHERE for DUPLICATE/UNIQUE/AGGREGATE KEY tables. 
+ """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("dt BETWEEN '2024-01-01' AND '2024-12-31'"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + assert "`dt` >= '2024-01-01'" in sql + assert "`dt` <= '2024-12-31'" in sql + assert "AND" in sql + + def test_delete_with_between_numeric( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test BETWEEN with numeric values.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("id BETWEEN 100 AND 200"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + assert "`id` >= 100" in sql + assert "`id` <= 200" in sql + + def test_delete_with_between_and_other_conditions( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test BETWEEN combined with other WHERE conditions.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + # Complex WHERE: id > 50 AND dt BETWEEN '2024-01-01' AND '2024-12-31' + adapter.delete_from( + exp.to_table("test_table"), + parse_one("id > 50 AND dt BETWEEN '2024-01-01' AND '2024-12-31'"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + assert "`id` > 50" in sql + assert "`dt` >= '2024-01-01'" in sql + assert "`dt` <= '2024-12-31'" in sql + + def test_delete_with_multiple_between( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test multiple BETWEEN expressions in one WHERE clause.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("dt BETWEEN '2024-01-01' AND '2024-12-31' AND id BETWEEN 1 AND 100"), + ) + + sql = to_sql_calls(adapter)[0] + assert "BETWEEN" not in sql + assert "`dt` >= '2024-01-01'" in sql + assert "`dt` <= '2024-12-31'" in sql + assert "`id` >= 1" in sql + assert "`id` <= 100" in sql + + def test_delete_with_and_true( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test AND TRUE is removed from WHERE clause. + + StarRocks Limitation: + Boolean literals are not supported in WHERE clauses. 
+ """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("id > 100 AND TRUE"), + ) + + sql = to_sql_calls(adapter)[0] + assert "TRUE" not in sql + assert "`id` > 100" in sql + # Should not have extra AND + assert sql.count("AND") == 0 + + def test_delete_with_true_and_condition( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test TRUE AND condition (reverse order).""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("TRUE AND id > 100"), + ) + + sql = to_sql_calls(adapter)[0] + assert "TRUE" not in sql + assert "`id` > 100" in sql + + def test_delete_with_or_false( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test OR FALSE is removed from WHERE clause.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("id > 100 OR FALSE"), + ) + + sql = to_sql_calls(adapter)[0] + assert "FALSE" not in sql + assert "`id` > 100" in sql + assert sql.count("OR") == 0 + + def test_delete_with_false_or_condition( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test FALSE OR condition (reverse order).""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + parse_one("FALSE OR id > 100"), + ) + + sql = to_sql_calls(adapter)[0] + assert "FALSE" not in sql + assert "`id` > 100" in sql + + def test_delete_with_standalone_false( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test standalone FALSE is converted to 1=0.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + adapter.delete_from( + exp.to_table("test_table"), + exp.false(), + ) + + sql = to_sql_calls(adapter)[0] + assert "FALSE" not in sql + # Converted to 1=0 (always false condition) + assert "1 = 0" in sql or "1=0" in sql + + def test_delete_with_combined_transformations( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test BETWEEN + boolean literals together. + + Verifies that multiple transformations work correctly when combined. 
+        """
+        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
+
+        # WHERE: dt BETWEEN '2024-01-01' AND '2024-12-31' AND TRUE
+        adapter.delete_from(
+            exp.to_table("test_table"),
+            parse_one("dt BETWEEN '2024-01-01' AND '2024-12-31' AND TRUE"),
+        )
+
+        sql = to_sql_calls(adapter)[0]
+        assert "BETWEEN" not in sql
+        assert "TRUE" not in sql
+        assert "`dt` >= '2024-01-01'" in sql
+        assert "`dt` <= '2024-12-31'" in sql
+
+    def test_delete_with_nested_boolean_expressions(
+        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
+    ):
+        """Test nested boolean expressions with multiple levels."""
+        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
+
+        # WHERE: (id > 100 AND TRUE) OR (name = 'test' AND FALSE)
+        # The `AND TRUE` branch is simplified, leaving: id > 100 OR (name = 'test' AND FALSE).
+        # The `AND FALSE` branch is left as-is: the transformation only rewrites direct
+        # `AND TRUE` / `OR FALSE` pairs at the binary level and does not attempt full
+        # constant folding down to `id > 100`.
+        adapter.delete_from(
+            exp.to_table("test_table"),
+            parse_one("(id > 100 AND TRUE) OR (name = 'test' AND FALSE)"),
+        )
+
+        sql = to_sql_calls(adapter)[0]
+        assert "TRUE" not in sql
+
+    def test_delete_with_between_in_complex_expression(
+        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
+    ):
+        """Test BETWEEN within a complex nested expression."""
+        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
+
+        adapter.delete_from(
+            exp.to_table("test_table"),
+            parse_one(
+                "(dt BETWEEN '2024-01-01' AND '2024-06-30') OR (dt BETWEEN '2024-07-01' AND '2024-12-31')"
+            ),
+        )
+
+        sql = to_sql_calls(adapter)[0]
+        assert "BETWEEN" not in sql
+        # First BETWEEN converted
+        assert "`dt` >= '2024-01-01'" in sql
+        assert "`dt` <= '2024-06-30'" in sql
+        # Second BETWEEN converted
+        assert "`dt` >= '2024-07-01'" in sql
+        assert "`dt` <= '2024-12-31'" in sql
+        assert "OR" in sql
+
+
+# =============================================================================
+# Key Property Building
+# =============================================================================
+class TestKeyPropertyBuilding:
+    """
+    Tests for table key types: primary_key, duplicate_key, unique_key, aggregate_key.
+
+    Key columns must be the first N columns in the table definition.
+    Tests parse actual Model SQL to ensure real-world compatibility.
+    """
+
+    @pytest.mark.parametrize(
+        "key_type,key_value,expected_clause",
+        [
+            # primary_key - single column
+            ("primary_key", "id", "PRIMARY KEY (`id`)"),
+            # primary_key - tuple form (multi-column)
+            ("primary_key", "(id, dt)", "PRIMARY KEY (`id`, `dt`)"),
+            # duplicate_key - tuple form
+            ("duplicate_key", "(id, name)", "DUPLICATE KEY (`id`, `name`)"),
+            # unique_key - tuple form
+            ("unique_key", "(id, dt)", "UNIQUE KEY (`id`, `dt`)"),
+            # aggregate_key - multi-column. 
not supported (requires aggregation function specification) + # ("aggregate_key", ("id", "dt"), "AGGREGATE KEY (`id`, `dt`)"), + ], + ) + def test_key_types_with_tuple_form( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + key_type: str, + key_value: str, + expected_clause: str, + ): + """Test key types with tuple form: (id, dt) parsed from physical_properties.""" + model_sql = f""" + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, dt DATE, name STRING, value DECIMAL(10,2)), + physical_properties ( + {key_type} = {key_value} + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert expected_clause in sql + + @pytest.mark.parametrize( + "key_string,expected_clause", + [ + # String with parentheses + ('"(id, dt)"', "PRIMARY KEY (`id`, `dt`)"), + # String without parentheses (auto-wrapped) + ('"id, dt"', "PRIMARY KEY (`id`, `dt`)"), + # Single column string + ('"id"', "PRIMARY KEY (`id`)"), + ], + ) + def test_primary_key_string_forms( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + key_string: str, + expected_clause: str, + ): + """Test primary_key with string forms (with/without parentheses) parsed from physical_properties.""" + model_sql = f""" + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, dt DATE, value DECIMAL(10,2)), + physical_properties ( + primary_key = {key_string} + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert expected_clause in sql + + def test_primary_key_single_identifier( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test primary_key = id (single identifier without quotes).""" + model_sql = """ + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, dt DATE), + physical_properties ( + primary_key = id + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert "PRIMARY KEY (`id`)" in sql + + def test_primary_key_via_table_properties_tuple( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test primary_key passed via physical_properties with tuple form - duplicate of test_key_types_with_tuple_form.""" + model_sql = """ + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, dt DATE, value DECIMAL(10,2)), + physical_properties ( + primary_key = (id, dt) + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + 
table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert "PRIMARY KEY (`id`, `dt`)" in sql + + def test_column_reordering_for_key( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test column reordering for key tables. + + StarRocks Requirement: + Key columns MUST be the first N columns in CREATE TABLE statement. + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + columns_to_types = { + "customer_id": exp.DataType.build("INT"), + "region": exp.DataType.build("VARCHAR(50)"), + "order_id": exp.DataType.build("BIGINT"), + "event_date": exp.DataType.build("DATE"), + "amount": exp.DataType.build("DECIMAL(18,2)"), + } + + adapter.create_table( + "test_table", + target_columns_to_types=columns_to_types, + primary_key=("order_id", "event_date"), + ) + + sql = to_sql_calls(adapter)[0] + assert "PRIMARY KEY (`order_id`, `event_date`)" in sql + + import re + + col_match = re.search(r"CREATE TABLE.*?\((.*)\)\s*PRIMARY KEY", sql, re.DOTALL) + assert col_match, "Could not extract column definitions" + col_defs = col_match.group(1) + + order_id_pos = col_defs.find("`order_id`") + event_date_pos = col_defs.find("`event_date`") + customer_id_pos = col_defs.find("`customer_id`") + + assert order_id_pos < event_date_pos, "order_id must appear before event_date" + assert event_date_pos < customer_id_pos, "event_date must appear before customer_id" + + +# ============================================================================= +# Partition Property Building +# ============================================================================= +class TestPartitionPropertyBuilding: + """Tests for partitioned_by/partition_by and partitions properties.""" + + @pytest.mark.parametrize( + "partition_expr,expected_clause", + [ + # Expression partitioning - single column + ("'dt'", "PARTITION BY (dt)"), + # Expression partitioning - multi-column + ("(year, month)", "PARTITION BY (year, month)"), + # RANGE partitioning + ("RANGE (dt)", "PARTITION BY RANGE (`dt`) ()"), + # LIST partitioning + ("LIST (region)", "PARTITION BY LIST (`region`) ()"), + ], + ) + def test_partitioned_by_forms( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + partition_expr: str, + expected_clause: str, + ): + """Test partition_by with various forms parsed from physical_properties.""" + model_sql = f""" + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, dt DATE, year INT, month INT, region STRING), + physical_properties ( + partition_by = {partition_expr} + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + partitioned_by=model.partitioned_by, + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert expected_clause in sql + + def test_partition_by_alias( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test partition_by as alias for partitioned_by in physical_properties.""" + model_sql = """ + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, year INT, month INT), + physical_properties ( + partition_by = (year, month) + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = 
make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + partitioned_by=model.partitioned_by, + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert "PARTITION BY (year, month)" in sql + + def test_partitioned_by_as_model_parameter( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test partitioned_by as model-level parameter (not in physical_properties).""" + model_sql = """ + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, year INT, month INT, value DECIMAL(10,2)), + partitioned_by (year, month) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + partitioned_by=model.partitioned_by, + ) + + sql = to_sql_calls(adapter)[0] + assert "PARTITION BY (year, month)" in sql + + def test_partitions_value_forms( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test partitions property with single and multiple partition definitions.""" + # Single partition string (paren) + model_sql_single = """ + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, dt DATE), + physical_properties ( + partition_by = RANGE(dt), + partitions = 'PARTITION p1 VALUES LESS THAN ("2024-01-01")' + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql_single, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + partitioned_by=model.partitioned_by, + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert "PARTITION p1" in sql + assert "VALUES LESS THAN" in sql + + # Multiple partitions (tuple of strings) + model_sql_multiple = """ + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, dt DATE), + physical_properties ( + partition_by = RANGE(dt), + partitions = ( + 'PARTITION p1 VALUES LESS THAN ("2024-01-01")', + 'PARTITION p2 VALUES LESS THAN ("2024-02-01")' + ) + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql_multiple, default_dialect="starrocks") + model = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + partitioned_by=model.partitioned_by, + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert "PARTITION p1" in sql + assert "PARTITION p2" in sql + + +# ============================================================================= +# Distribution Property Building +# ============================================================================= +class TestDistributionPropertyBuilding: + """Tests for distributed_by property.""" + + @pytest.mark.parametrize( + "dist_input,expected_clause", + [ + # String form: HASH single column + ('"HASH(id) BUCKETS 10"', "DISTRIBUTED BY HASH (`id`) BUCKETS 10"), + # String form: HASH multi-column + ( + '"HASH(id, region) BUCKETS 16"', + "DISTRIBUTED BY HASH (`id`, `region`) BUCKETS 16", + ), + # String form: RANDOM + ('"RANDOM"', "DISTRIBUTED BY RANDOM"), + # String form: RANDOM with BUCKETS + ('"RANDOM BUCKETS 10"', "DISTRIBUTED BY RANDOM BUCKETS 10"), + ], + ) + def 
test_distributed_by_string_forms( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + dist_input: str, + expected_clause: str, + ): + """Test distributed_by with string forms parsed from physical_properties.""" + model_sql = f""" + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, region STRING), + physical_properties ( + distributed_by = {dist_input} + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert expected_clause in sql + + @pytest.mark.parametrize( + "dist_struct,expected_clause", + [ + # Structured: HASH with quoted kind + ("(kind='HASH', expressions=id, buckets=32)", "DISTRIBUTED BY HASH (`id`) BUCKETS 32"), + # Structured: HASH with unquoted kind (Column) + ("(kind=HASH, expressions=id, buckets=10)", "DISTRIBUTED BY HASH (`id`) BUCKETS 10"), + # Structured: HASH multi-column + ( + "(kind='HASH', expressions=(a, b), buckets=16)", + "DISTRIBUTED BY HASH (`a`, `b`) BUCKETS 16", + ), + # Structured: RANDOM + ("(kind='RANDOM')", "DISTRIBUTED BY RANDOM"), + # Structured: RANDOM with buckets + ("(kind=RANDOM, buckets=10)", "DISTRIBUTED BY RANDOM BUCKETS 10"), + ], + ) + def test_distributed_by_structured_forms( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + dist_struct: str, + expected_clause: str, + ): + """Test distributed_by with structured tuple forms parsed from physical_properties.""" + model_sql = f""" + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, a INT, b STRING, region STRING), + physical_properties ( + distributed_by = {dist_struct} + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert expected_clause in sql + + +# ============================================================================= +# Order By Property Building +# ============================================================================= +class TestOrderByPropertyBuilding: + """Tests for order_by and clustered_by properties.""" + + @pytest.mark.parametrize( + "order_value,expected_clause,description", + [ + # String form (double-quoted string) + ('"id"', "ORDER BY (`id`)", "Bare string: single column"), + ( + '"id, timestamp"', + "ORDER BY (`id`, `timestamp`)", + "Bare string: multi-column without parens", + ), + ('"(id, timestamp)"', "ORDER BY (`id`, `timestamp`)", "String with parens"), + # Literal form (single-quoted string) + ("'id'", "ORDER BY (`id`)", "Bare string: single column"), + ( + "'id, timestamp'", + "ORDER BY (`id`, `timestamp`)", + "Bare string: multi-column without parens", + ), + ("'(id, timestamp)'", "ORDER BY (`id`, `timestamp`)", "String with parens"), + # Tuple form (direct expression construction in MODEL) + ("(id, timestamp)", "ORDER BY (`id`, `timestamp`)", "Tuple: multi-column"), + # Single identifier (unquoted) + ("id", "ORDER BY (`id`)", "Identifier: single column"), + ], + ) + def test_order_by_value_forms( + self, + make_mocked_engine_adapter: 
t.Callable[..., StarRocksEngineAdapter], + order_value: str, + expected_clause: str, + description: str, + ): + """Test ORDER BY with various input forms parsed from physical_properties.""" + model_sql = f""" + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, timestamp DATETIME, value DECIMAL(10,2)), + physical_properties ( + order_by = {order_value} + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert expected_clause in sql, ( + f"\nTest case: {description}\n" + f"Input: {order_value}\n" + f"Expected: {expected_clause}\n" + f"Actual SQL: {sql}" + ) + + def test_clustered_by_generates_order_by( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test that clustered_by parameter generates ORDER BY clause.""" + model_sql = """ + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, timestamp DATETIME, value DECIMAL(10,2)), + physical_properties ( + clustered_by = (id, timestamp) + ) + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + table_properties=model.physical_properties, + ) + + sql = to_sql_calls(adapter)[0] + assert "ORDER BY (`id`, `timestamp`)" in sql + assert "CLUSTER BY" not in sql + + def test_clustered_by_as_model_parameter( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test clustered_by as model-level parameter (not in physical_properties).""" + model_sql = """ + MODEL ( + name t, + kind FULL, + dialect starrocks, + columns (id INT, timestamp DATETIME, value DECIMAL(10,2)), + clustered_by id + ); + SELECT 1; + """ + + parsed = parse(model_sql, default_dialect="starrocks") + model: SqlModel = t.cast(SqlModel, load_sql_based_model(parsed)) + + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + model.name, + _columns(model), + clustered_by=model.clustered_by, + ) + + sql = to_sql_calls(adapter)[0] + assert "ORDER BY (`id`)" in sql + # Verify that StarRocks uses ORDER BY, not CLUSTER BY + assert "CLUSTER BY" not in sql + + +# ============================================================================= +# Generic Property Building +# ============================================================================= +class TestGenericPropertyBuilding: + """Tests for generic table properties (replication_num, etc.).""" + + @pytest.mark.parametrize( + "prop_name,prop_value,expected_in_sql", + [ + # Integer value + ("replication_num", "1", "'replication_num'='1'"), + ("replication_num", "3", "'replication_num'='3'"), + # Boolean TRUE + ("enable_persistent_index", "TRUE", "'enable_persistent_index'='TRUE'"), + # Boolean FALSE + ("in_memory", "FALSE", "'in_memory'='FALSE'"), + # String value + ("compression", "LZ4", "'compression'='LZ4'"), + ], + ) + def test_generic_property_value_forms( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + prop_name: str, + prop_value: str, + expected_in_sql: str, + ): + """Test generic properties with various value types.""" + adapter = 
make_mocked_engine_adapter(StarRocksEngineAdapter)
+
+        adapter.create_table(
+            "test_table",
+            target_columns_to_types={
+                "id": exp.DataType.build("INT"),
+                "name": exp.DataType.build("VARCHAR(100)"),
+            },
+            primary_key=("id",),
+            table_properties={
+                prop_name: prop_value,
+            },
+        )
+
+        sql = to_sql_calls(adapter)[0]
+        assert expected_in_sql in sql
+
+
+# =============================================================================
+# View Property Building
+# =============================================================================
+class TestViewPropertyBuilding:
+    """Tests for StarRocks-specific view properties (SECURITY)."""
+
+    @pytest.mark.parametrize(
+        "property_sql,expected_fragment",
+        [
+            ("INVOKER", "SECURITY INVOKER"),
+            ("'INVOKER'", "SECURITY INVOKER"),
+            ("invoker", "SECURITY INVOKER"),
+            ("NONE", "SECURITY NONE"),
+        ],
+    )
+    def test_security_value_forms(
+        self,
+        make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter],
+        property_sql: str,
+        expected_fragment: str,
+    ):
+        """Ensure the different input forms all render the expected SECURITY clause."""
+        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
+
+        model_sql = f"""
+        MODEL (
+            name test_schema.test_view_security,
+            kind VIEW,
+            dialect starrocks,
+            columns (c INT),
+            virtual_properties (
+                security = {property_sql}
+            )
+        );
+        SELECT 1 AS c;
+        """
+        model = _load_sql_model(model_sql)
+
+        query = model.render_query()
+        adapter.create_view(
+            model.name,
+            query,
+            replace=False,
+            target_columns_to_types=_columns(model),
+            view_properties=model.virtual_properties,
+        )
+
+        sql = to_sql_calls(adapter)[0]
+        assert expected_fragment in sql
+
+    def test_security_invalid_value(
+        self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter]
+    ):
+        """Invalid SECURITY enum should raise SQLMeshError."""
+        adapter = make_mocked_engine_adapter(StarRocksEngineAdapter)
+        model_sql = """
+        MODEL (
+            name test_schema.test_view_security_invalid,
+            kind VIEW,
+            dialect starrocks,
+            columns (c INT),
+            virtual_properties (
+                security = foo
+            )
+        );
+        SELECT 1 AS c;
+        """
+        model = _load_sql_model(model_sql)
+
+        query = model.render_query()
+        with pytest.raises(SQLMeshError, match="security"):
+            adapter.create_view(
+                model.name,
+                query,
+                replace=False,
+                target_columns_to_types=_columns(model),
+                view_properties=model.virtual_properties,
+            )
+
+
+# =============================================================================
+# Materialized View Refresh Property Building
+# =============================================================================
+class TestMVRefreshPropertyBuilding:
+    """Tests for refresh_moment / refresh_scheme parsing and rendering."""
+
+    def _build_mv_model(self, property_sql: str) -> SqlModel:
+        """Build a minimal MV model whose virtual_properties contain the given property."""
+        model_sql = f"""
+        MODEL (
+            name test_schema.test_mv_refresh_model,
+            kind VIEW,
+            dialect starrocks,
+            columns (a INT),
+            virtual_properties (
+                {property_sql}
+            )
+        );
+        SELECT 1 AS a;
+        """
+        return _load_sql_model(model_sql)
+
+    def _create_simple_mv(
+        self,
+        adapter: StarRocksEngineAdapter,
+        model: SqlModel,
+    ) -> str:
+        """Create the materialized view through the adapter and return the generated SQL."""
+        query = model.render_query()
+        adapter.create_view(
+            "test_mv_refresh",
+            query,
+            replace=False,
+            materialized=True,
+            target_columns_to_types=_columns(model),
+            view_properties=model.virtual_properties,
+        )
+        # replace=False → only CREATE statement is emitted
+        return to_sql_calls(adapter)[-1]
+
+    @pytest.mark.parametrize(
+        "property_sql,expected_fragment",
+        [
+            ("refresh_moment = IMMEDIATE", "REFRESH IMMEDIATE"),
+            ("refresh_moment = deferred", "REFRESH DEFERRED"),
], + ) + def test_refresh_moment_value_forms( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + property_sql: str, + expected_fragment: str, + ): + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model = self._build_mv_model(property_sql) + sql = self._create_simple_mv(adapter, model) + assert expected_fragment in sql + + @pytest.mark.parametrize( + "property_sql,expected_fragments", + [ + ("refresh_scheme = ASYNC", ["REFRESH", "ASYNC"]), + # single quote value with single quote start + ( + "refresh_scheme = 'ASYNC START (''2025-01-01 00:00:00'') EVERY (INTERVAL 5 MINUTE)'", + [ + "REFRESH", + "ASYNC", + "START ('2025-01-01 00:00:00')", + "EVERY (INTERVAL 5 MINUTE)", + ], + ), + # single quote value with double quote start + ( + "refresh_scheme = 'ASYNC START (\"2025-02-01 00:00:00\") EVERY (INTERVAL 5 MINUTE)'", + [ + "REFRESH", + "ASYNC", + "START ('2025-02-01 00:00:00')", + "EVERY (INTERVAL 5 MINUTE)", + ], + ), + # double quote value with single quote start + ( + "refresh_scheme = \"async start ('2025-03-01') every (interval 10 minute)\"", + [ + "REFRESH", + "ASYNC", + "START ('2025-03-01')", + "EVERY (INTERVAL 10 MINUTE)", + ], + ), + ("refresh_scheme = MANUAL", ["REFRESH", "MANUAL"]), + ], + ) + def test_refresh_scheme_value_forms( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + property_sql: str, + expected_fragments: t.List[str], + ): + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model = self._build_mv_model(property_sql) + sql = self._create_simple_mv(adapter, model) + for fragment in expected_fragments: + assert fragment in sql + + def test_refresh_moment_invalid_value( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model = self._build_mv_model("refresh_moment = AUTO") + with pytest.raises(SQLMeshError): + self._create_simple_mv(adapter, model) + + def test_refresh_scheme_invalid_prefix( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model = self._build_mv_model("refresh_scheme = 'SCHEDULE EVERY (INTERVAL 5 MINUTE)'") + with pytest.raises(SQLMeshError, match="refresh_scheme"): + self._create_simple_mv(adapter, model) + + +# ============================================================================= +# Comment Property Building +# ============================================================================= +class TestCommentPropertyBuilding: + """Tests for table and column comments.""" + + def test_table_and_column_comments( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE TABLE with table and column comments.""" + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + "test_table", + target_columns_to_types={ + "a": exp.DataType.build("INT"), + "b": exp.DataType.build("VARCHAR(100)"), + }, + table_description="Test table description", + column_descriptions={ + "a": "Column A description", + "b": "Column B description", + }, + ) + + sql = to_sql_calls(adapter)[0] + assert "COMMENT 'Test table description'" in sql + assert "COMMENT 'Column A description'" in sql + assert "COMMENT 'Column B description'" in sql + + def test_view_with_comments( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """Test CREATE VIEW with comments.""" + adapter = 
make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_view( + "test_view", + parse_one("SELECT a FROM tbl"), + replace=False, + target_columns_to_types={"a": exp.DataType.build("INT")}, + table_description="Test view description", + column_descriptions={"a": "Column A description"}, + ) + + sql = to_sql_calls(adapter)[0] + assert "COMMENT 'Test view description'" in sql + assert "COMMENT 'Column A description'" in sql + + @pytest.mark.parametrize( + "table_name,comment,expected_sql", + [ + ( + "test_table", + "Test table comment", + "ALTER TABLE `test_table` COMMENT = 'Test table comment'", + ), + ( + "db.test_table", + "Database qualified table comment", + "ALTER TABLE `db`.`test_table` COMMENT = 'Database qualified table comment'", + ), + ( + "test_table", + "It's a test", + None, # Will check for escaped quote + ), + ], + ids=["simple_table", "qualified_table", "special_chars"], + ) + def test_build_create_comment_table_exp( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + table_name: str, + comment: str, + expected_sql: t.Optional[str], + ): + """ + Test _build_create_comment_table_exp generates correct ALTER TABLE COMMENT SQL. + + Verifies: + 1. SQL format: ALTER TABLE {table} COMMENT = '{comment}' + 2. No MODIFY keyword (StarRocks uses direct COMMENT =) + 3. Comment is properly quoted + 4. Table name is properly quoted + 5. Special characters are escaped + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + table = exp.to_table(table_name) + sql = adapter._build_create_comment_table_exp(table, comment, "TABLE") + + if expected_sql: + assert sql == expected_sql + else: + # Special chars case - check for escaped quote + assert "It's a test" in sql or "It''s a test" in sql + + # Common assertions for all cases + assert "ALTER TABLE" in sql + assert "COMMENT =" in sql + assert "MODIFY" not in sql # StarRocks doesn't use MODIFY for table comments + + def test_build_create_comment_table_exp_truncation( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test _build_create_comment_table_exp truncates long comments. + + Verifies comments longer than MAX_TABLE_COMMENT_LENGTH (2048) are truncated. + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + table = exp.to_table("test_table") + long_comment = "x" * 3000 # Longer than MAX_TABLE_COMMENT_LENGTH (2048) + sql = adapter._build_create_comment_table_exp(table, long_comment, "TABLE") + + # The comment should be truncated to 2048 characters + expected_truncated = "x" * 2048 + assert expected_truncated in sql + assert "xxx" * 1000 not in sql # Verify it's actually truncated + + @pytest.mark.parametrize( + "table_name,column_name,comment,expected_sql", + [ + ( + "test_table", + "test_column", + "Test column comment", + "ALTER TABLE `test_table` MODIFY COLUMN `test_column` COMMENT 'Test column comment'", + ), + ( + "db.test_table", + "id", + "ID column", + "ALTER TABLE `db`.`test_table` MODIFY COLUMN `id` COMMENT 'ID column'", + ), + ], + ids=["simple_table", "qualified_table"], + ) + def test_build_create_comment_column_exp( + self, + make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter], + table_name: str, + column_name: str, + comment: str, + expected_sql: str, + ): + """ + Test _build_create_comment_column_exp generates correct ALTER TABLE MODIFY COLUMN SQL. + + Verifies: + 1. SQL format: ALTER TABLE {table} MODIFY COLUMN {column} COMMENT '{comment}' + 2. No column type required (StarRocks supports this) + 3. 
Comment is properly quoted + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + table = exp.to_table(table_name) + sql = adapter._build_create_comment_column_exp(table, column_name, comment, "TABLE") + + assert sql == expected_sql + # Should NOT contain column type + assert "VARCHAR" not in sql + assert "INT" not in sql + assert "BIGINT" not in sql + + def test_build_create_comment_column_exp_truncation( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test _build_create_comment_column_exp truncates long comments. + + Verifies comments longer than MAX_COLUMN_COMMENT_LENGTH (255) are truncated. + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + + table = exp.to_table("test_table") + long_comment = "y" * 500 # Longer than MAX_COLUMN_COMMENT_LENGTH (255) + sql = adapter._build_create_comment_column_exp(table, "test_col", long_comment, "TABLE") + + # The comment should be truncated to 255 characters + expected_truncated = "y" * 255 + assert expected_truncated in sql + assert "yyy" * 200 not in sql # Verify it's actually truncated + + +# ============================================================================= +# Invalid Property Scenarios +# ============================================================================= +class TestInvalidPropertyScenarios: + """Unit tests for property validation errors (mutual exclusivity, aliases, names).""" + + def test_key_type_mutually_exclusive( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model_sql = """ + MODEL ( + name test_schema.test_conflicting_keys, + kind FULL, + dialect starrocks, + columns ( + id INT, + dt DATE, + value INT + ), + physical_properties ( + primary_key = (id), + unique_key = (id) + ) + ); + SELECT id, dt, value FROM source_table; + """ + model = _load_sql_model(model_sql) + columns = _columns(model) + + with pytest.raises(SQLMeshError, match="Multiple table key type"): + adapter.create_table( + "test_conflicting_keys", + target_columns_to_types=columns, + table_properties=model.physical_properties, + ) + + def test_partition_alias_conflict_with_parameter( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model_sql = """ + MODEL ( + name test_schema.test_partition_conflict, + kind FULL, + dialect starrocks, + columns ( + id INT, + dt DATE, + value INT + ), + partitioned_by (dt), + physical_properties ( + partition_by = (dt) + ) + ); + SELECT id, dt, value FROM source_table; + """ + model = _load_sql_model(model_sql) + + with pytest.raises(SQLMeshError, match="partition definition"): + adapter.create_table( + model.name, + target_columns_to_types=_columns(model), + partitioned_by=model.partitioned_by, + table_properties=model.physical_properties, + ) + + def test_invalid_property_name_detection( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + model_sql = """ + MODEL ( + name test_schema.test_invalid_property, + kind FULL, + dialect starrocks, + columns ( + id INT, + dt DATE, + value INT + ), + physical_properties ( + partition = dt + ) + ); + SELECT id, dt, value FROM source_table; + """ + model = _load_sql_model(model_sql) + + with pytest.raises(SQLMeshError, match="Invalid property 'partition'"): + adapter.create_table( + model.name, + 
target_columns_to_types=_columns(model), + table_properties=model.physical_properties, + ) + + +# ============================================================================= +# Comprehensive Tests +# ============================================================================= +class TestComprehensive: + """Comprehensive tests combining multiple features.""" + + def test_create_table_comprehensive( + self, make_mocked_engine_adapter: t.Callable[..., StarRocksEngineAdapter] + ): + """ + Test CREATE TABLE with all features combined: + - PRIMARY KEY + - Table and column comments + - DISTRIBUTED BY + - ORDER BY + - Custom properties + """ + adapter = make_mocked_engine_adapter(StarRocksEngineAdapter) + adapter.create_table( + "test_table", + target_columns_to_types={ + "customer_id": exp.DataType.build("INT"), + "order_id": exp.DataType.build("BIGINT"), + "event_date": exp.DataType.build("DATE"), + "amount": exp.DataType.build("DECIMAL(10,2)"), + }, + primary_key=("order_id", "event_date"), + table_description="Sales transaction table", + column_descriptions={ + "customer_id": "Customer identifier", + "order_id": "Order identifier", + }, + table_properties={ + "distributed_by": exp.Tuple( + expressions=[ + exp.EQ( + this=exp.Column(this="kind"), + expression=exp.Literal.string("HASH"), + ), + exp.EQ( + this=exp.Column(this="expressions"), + expression=exp.Tuple(expressions=[exp.to_column("customer_id")]), + ), + exp.EQ( + this=exp.Column(this="buckets"), + expression=exp.Literal.number(10), + ), + ] + ), + "replication_num": "3", + }, + clustered_by=[exp.to_column("customer_id"), exp.to_column("order_id")], + ) + + sql = to_sql_calls(adapter)[0] + assert "CREATE TABLE IF NOT EXISTS `test_table`" in sql + assert "PRIMARY KEY (`order_id`, `event_date`)" in sql + assert "COMMENT 'Sales transaction table'" in sql + assert "COMMENT 'Customer identifier'" in sql + assert "COMMENT 'Order identifier'" in sql + assert "DISTRIBUTED BY HASH (`customer_id`) BUCKETS 10" in sql + assert "ORDER BY (`customer_id`, `order_id`)" in sql + assert "PROPERTIES ('replication_num'='3')" in sql diff --git a/tests/core/test_connection_config.py b/tests/core/test_connection_config.py index dd979a2551..1f51b8b990 100644 --- a/tests/core/test_connection_config.py +++ b/tests/core/test_connection_config.py @@ -20,6 +20,7 @@ MySQLConnectionConfig, PostgresConnectionConfig, SnowflakeConnectionConfig, + StarRocksConnectionConfig, TrinoAuthenticationMethod, AthenaConnectionConfig, MSSQLConnectionConfig, @@ -1947,3 +1948,59 @@ def test_schema_differ_overrides(make_config) -> None: adapter = config.create_engine_adapter() assert adapter._schema_differ_overrides == override assert adapter.schema_differ.parameterized_type_defaults == {} + + +def test_starrocks(make_config): + """Test StarRocksConnectionConfig basic functionality""" + # Basic configuration + config = make_config( + type="starrocks", + host="localhost", + user="root", + password="password", + port=9030, + database="testdb", + check_import=False, + ) + assert isinstance(config, StarRocksConnectionConfig) + assert config.type_ == "starrocks" + assert config.host == "localhost" + assert config.user == "root" + assert config.password == "password" + assert config.port == 9030 + assert config.database == "testdb" + assert config.DIALECT == "starrocks" + assert config.DISPLAY_NAME == "StarRocks" + assert config.DISPLAY_ORDER == 19 + assert config.is_recommended_for_state_sync is False + + # Test with minimal configuration (using default port) + minimal_config = 
make_config( + type="starrocks", + host="starrocks-fe", + user="starrocks_user", + password="starrocks_pswd", + check_import=False, + ) + assert isinstance(minimal_config, StarRocksConnectionConfig) + assert minimal_config.port == 9030 # Default StarRocks FE port + assert minimal_config.host == "starrocks-fe" + assert minimal_config.user == "starrocks_user" + + # Test with additional MySQL-compatible options + advanced_config = make_config( + type="starrocks", + host="starrocks-fe", + user="admin", + password="admin123", + port=9030, + database="testdb", + charset="utf8mb4", + ssl_disabled=True, + concurrent_tasks=10, + check_import=False, + ) + assert isinstance(advanced_config, StarRocksConnectionConfig) + assert advanced_config.charset == "utf8mb4" + assert advanced_config.ssl_disabled is True + assert advanced_config.concurrent_tasks == 10