diff --git a/.env.example b/.env.example index 8d41bc014d..acaf2262c5 100644 --- a/.env.example +++ b/.env.example @@ -16,6 +16,11 @@ SQLMESH_DUCKDB_LOCAL_PATH=/tmp/oso.duckdb DAGSTER_USE_LOCAL_SECRETS=True #DAGSTER_GCP_SECRETS_PREFIX=dagster +# OSO's python libraries are configured to use json logging by default but this +# can be annoying when viewing things locally. This will configure logs to be +# output in a more human-readable format. +OSO_ENABLE_JSON_LOGS=0 + ## Google Cloud setup # You will need to generate Google application credentials. # You can log in via `gcloud auth application-default login` diff --git a/apps/docs/docs/contribute-data/setup/index.md b/apps/docs/docs/contribute-data/setup/index.md index 99a1683177..b954b23312 100644 --- a/apps/docs/docs/contribute-data/setup/index.md +++ b/apps/docs/docs/contribute-data/setup/index.md @@ -215,3 +215,40 @@ Notice that after `-m` the code location's module path is specified. It is useful to note for newcomers that the `warehouse/` path in the repository is not considered a python module as it does not contain a `__init__.py` file and does not appear as a python module in the root `pyproject.toml` + +### Running dagster with sqlmesh locally + +This is mostly for the OSO team as most people should not need to run sqlmesh on +the dagster UI in a local fashion. It should be enough for anyone looking to add +models to run sqlmesh on its own. The only reason to run sqlmesh locally is to +ensure that the dagster-sqlmesh integration is working as expected with our +particular pipeline. + +Some environment variables need to be set in your `.env`: + +```bash +# While not strictly necessary, you likely want the sqlmesh dagster asset +# caching enabled so restarting doesn't take so long. 
+DAGSTER_ASSET_CACHE_ENABLED=1 +DAGSTER_ASSET_CACHE_DIR=/path/to/some/cache/dir # change this +# You can set this number to anything reasonable for your testing use case +DAGSTER_ASSET_CACHE_DEFAULT_TTL_SECONDS=3600 +# `local` uses duckdb +# `local-trino` uses a locally deployed trino +# Suggestion is to use `local` as it's faster. This doc assumes duckdb. +DAGSTER_SQLMESH_GATEWAY=local +SQLMESH_TESTING_ENABLED=1 +OSO_ENABLE_JSON_LOGS=0 +``` + +Then you should run the sqlmesh local test setup to get your local sqlmesh +duckdb initialized with oso local seed data. + +```bash +uv run oso local sqlmesh-test --duckdb +``` + +Now it should be possible to run sqlmesh and dagster locally. When materializing +sqlmesh assets, it might complain about some out-of-date dependencies. Since we +ran the local test setup, the data it's depending on should have been added by +the oso local seed setup. diff --git a/pyproject.toml b/pyproject.toml index 0821cb8090..eb2246f852 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ dependencies = [ "kr8s==0.20.9", "structlog>=25.4.0", "pandas-gbq>=0.29.2", - "dagster-sqlmesh>=0.19.0", + "dagster-sqlmesh>=0.20.0", "oso-core", "pyoso", "metrics-service" diff --git a/uv.lock b/uv.lock index 179dbbd20c..e9a53f3efb 100644 --- a/uv.lock +++ b/uv.lock @@ -1029,17 +1029,18 @@ wheels = [ [[package]] name = "dagster-sqlmesh" -version = "0.19.0" +version = "0.20.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dagster" }, { name = "pyarrow" }, + { name = "pydantic" }, { name = "pytest" }, { name = "sqlmesh" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c4/53/7d26939f25cabfaefc9a37e22114d3760e72eef3764c89054a0f3f7dfe72/dagster_sqlmesh-0.19.0.tar.gz", hash = "sha256:95b743c99ea08adb9aa3df3a46fc08c96297e9b070af80bdc8a2dab8bcfbf92c", size = 217724 } +sdist = { url = 
"https://files.pythonhosted.org/packages/c6/4c/96dfe9713190c8a4ca1a0208ef9cfbdc3af6ef60841ff54a37e129a2c4c5/dagster_sqlmesh-0.20.0.tar.gz", hash = "sha256:8db2c520258187b8a66ba075f30cbd3a06d4d669fbb7462b9b04cd81b75c0f93", size = 166508 } wheels = [ - { url = "https://files.pythonhosted.org/packages/bf/d3/087c3f97fdd83c42a708714fdba641e941ef0c0eaee03ebfeec6c20a80f1/dagster_sqlmesh-0.19.0-py3-none-any.whl", hash = "sha256:4ff76c643157a3e4ac96f7ae0475c4013b2b3e1e659976e670e506101aa52ab6", size = 33449 }, + { url = "https://files.pythonhosted.org/packages/43/0f/81ad0b2bd4ab4453ec52c06f11a143271e4e24c5c6949656b6f96a0fdebc/dagster_sqlmesh-0.20.0-py3-none-any.whl", hash = "sha256:7b6227dfc078f5acfeae05a86a7c65831b2bc46123f89d90519a4c4f9993dc24", size = 37578 }, ] [[package]] @@ -3746,7 +3747,7 @@ requires-dist = [ { name = "dagster-k8s", specifier = ">=0.24.6,<1.0.0" }, { name = "dagster-polars", specifier = ">=0.24.0,<1.0.0" }, { name = "dagster-postgres", specifier = ">=0.24.0,<1.0.0" }, - { name = "dagster-sqlmesh", specifier = ">=0.19.0" }, + { name = "dagster-sqlmesh", specifier = ">=0.20.0" }, { name = "dagster-webserver", specifier = ">=1.7.16,<2.0.0" }, { name = "dask", extras = ["distributed"], specifier = ">=2024.4.2,<2025.8.0" }, { name = "dask-kubernetes", specifier = ">=2024.4.2,<2025.8.0" }, diff --git a/warehouse/oso_dagster/assets/sqlmesh/sqlmesh.py b/warehouse/oso_dagster/assets/sqlmesh/sqlmesh.py index 40c540c584..8fa1eef909 100644 --- a/warehouse/oso_dagster/assets/sqlmesh/sqlmesh.py +++ b/warehouse/oso_dagster/assets/sqlmesh/sqlmesh.py @@ -194,8 +194,10 @@ def run_sqlmesh( config.allow_destructive_models ) - # If we specify a dev_environment, we will first plan it for safety - if dev_environment: + # If we specify a dev_environment, we will first plan it for + # safety. Restatements are ignored as they may end up duplicating + # work based on how restatement in planning works. 
+ if dev_environment and not config.restate_models: context.log.info("Planning dev environment") all( sqlmesh.run( @@ -206,6 +208,7 @@ def run_sqlmesh( end=config.end, restate_models=restate_models, skip_run=True, + materializations_enabled=False, ) )