diff --git a/.github/actions/setup-base/action.yaml b/.github/actions/setup-base/action.yaml index ecfb9d7..f4e8c22 100644 --- a/.github/actions/setup-base/action.yaml +++ b/.github/actions/setup-base/action.yaml @@ -16,4 +16,4 @@ runs: cache: 'pip' - name: Install dependencies shell: bash - run: make install \ No newline at end of file + run: make install-dev \ No newline at end of file diff --git a/Makefile b/Makefile index abe2e07..40ace39 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -install: +install-dev: pip install -e ".[dev]" test: diff --git a/README.md b/README.md index 47e1785..bc2250f 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ The properties are as follows: #### time_column -This is the column in the dataset that contains the timestamp. It follows the [same syntax](https://sqlmesh.readthedocs.io/en/latest/concepts/models/model_kinds/#time-column) as upstream `INCREMENTAL_BY_TIME_RANGE`. +This is the column in the dataset that contains the timestamp. It follows the [same syntax](https://sqlmesh.readthedocs.io/en/latest/concepts/models/model_kinds/#time-column) as upstream `INCREMENTAL_BY_TIME_RANGE` and also the same rules with regard to respecting the project [time_column_format](https://sqlmesh.readthedocs.io/en/stable/reference/configuration/#environments) property and being automatically added to the model `partitioned_by` field list. #### primary_key @@ -73,4 +73,23 @@ This is the column or combination of columns that uniquely identifies a record. The columns listed here are used in the `ON` clause of the SQL Merge to join the source and target datasets. -Note that the `time_column` is **not** automatically injected into this list (to allow timestamps on records to be updated), so if the `time_column` does actually form part of the primary key in your dataset then it needs to be added here. 
\ No newline at end of file +Note that the `time_column` is **not** automatically injected into this list (to allow timestamps on records to be updated), so if the `time_column` does actually form part of the primary key in your dataset then it needs to be added here. + +#### partition_by_time_column + +By default, the `time_column` will get added to the list of fields in the model `partitioned_by` property, causing it to be included in the table partition key. This may be undesirable in some circumstances. + +To opt out of this behaviour, you can set `partition_by_time_column = false` like so: + +``` +MODEL ( + name my_db.my_model, + kind CUSTOM ( + materialization 'non_idempotent_incremental_by_time_range', + materialization_properties ( + ..., + partition_by_time_column = false + ) + ) +); +``` \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a1f1021..c1382a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "Utilities for SQLMesh" readme = "README.md" requires-python = ">= 3.9" dependencies = [ - "sqlmesh>=0.160.0" + "sqlmesh>=0.163.0" ] [project.optional-dependencies] diff --git a/sqlmesh_utils/materializations/non_idempotent_incremental_by_time_range.py b/sqlmesh_utils/materializations/non_idempotent_incremental_by_time_range.py index f7d82ad..1a7bfa7 100644 --- a/sqlmesh_utils/materializations/non_idempotent_incremental_by_time_range.py +++ b/sqlmesh_utils/materializations/non_idempotent_incremental_by_time_range.py @@ -7,7 +7,7 @@ from sqlmesh.utils.date import make_inclusive from sqlmesh.utils.errors import ConfigError, SQLMeshError from pydantic import model_validator -from sqlmesh.utils.pydantic import list_of_fields_validator +from sqlmesh.utils.pydantic import list_of_fields_validator, bool_validator from sqlmesh.utils.date import TimeLike from sqlmesh.core.engine_adapter.base import MERGE_SOURCE_ALIAS, MERGE_TARGET_ALIAS from sqlmesh import CustomKind @@ -22,6 +22,8 @@ class 
NonIdempotentIncrementalByTimeRangeKind(CustomKind): # this is deliberately primary_key instead of unique_key to direct away from INCREMENTAL_BY_UNIQUE_KEY _primary_key: t.List[exp.Expression] + _partition_by_time_column: bool + @model_validator(mode="after") def _validate_model(self): self._time_column = TimeColumn.create( @@ -44,6 +46,10 @@ def _validate_model(self): "`primary_key` cannot be just the time_column. Please list the columns that when combined, uniquely identify a row" ) + self._partition_by_time_column = bool_validator( + self.materialization_properties.get("partition_by_time_column", True) + ) + return self @property @@ -54,6 +60,10 @@ def time_column(self) -> TimeColumn: def primary_key(self) -> t.List[exp.Expression]: return self._primary_key + @property + def partition_by_time_column(self) -> bool: + return self._partition_by_time_column + class NonIdempotentIncrementalByTimeRangeMaterialization( CustomMaterialization[NonIdempotentIncrementalByTimeRangeKind] diff --git a/tests/materializations/test_non_idempotent_incremental_by_time_range.py b/tests/materializations/test_non_idempotent_incremental_by_time_range.py index 0da880a..9d37c38 100644 --- a/tests/materializations/test_non_idempotent_incremental_by_time_range.py +++ b/tests/materializations/test_non_idempotent_incremental_by_time_range.py @@ -51,6 +51,9 @@ def test_kind(make_model: ModelMaker): model = make_model(["time_column = ds", "primary_key = (id, ds)"]) assert isinstance(model.kind, NonIdempotentIncrementalByTimeRangeKind) + assert model.partitioned_by == [exp.to_column("ds", quoted=True)] + assert model.kind.partition_by_time_column + assert model.kind.time_column.column == exp.to_column("ds", quoted=True) assert model.kind.primary_key == [ exp.to_column("id", quoted=True), @@ -157,3 +160,13 @@ def test_append(make_model: ModelMaker, make_mocked_engine_adapter: MockedEngine dialect=adapter.dialect, ).sql(dialect=adapter.dialect) ] + + +def 
test_partition_by_time_column_opt_out(make_model: ModelMaker): + model = make_model( + ["time_column = ds", "primary_key = name", "partition_by_time_column = false"] + ) + + assert isinstance(model.kind, NonIdempotentIncrementalByTimeRangeKind) + assert not model.kind.partition_by_time_column + assert model.partitioned_by == []