-
Notifications
You must be signed in to change notification settings - Fork 121
Ele 4874 dremio #838
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ele 4874 dremio #838
Changes from all commits
6931918
ecfa917
fd066cc
a8a4374
c002b47
90fe8db
6d4eda7
95948c3
0412bae
3e251cc
11976a2
9a63dba
1b63275
a9017b4
c56ed1e
da8e5f2
b523e08
79d8ae1
c4e6494
9d81741
489a724
d346398
bce218f
83489c2
4f4603e
b6fb5fc
7d6bdbf
cec4698
c5c689c
65c39fa
73c88ed
e616d80
cf41fca
62ccff6
dc51698
6a3a7f4
c825966
2374288
4c537c3
9b2fddf
b66e737
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| target/ | ||
| dbt_packages/ | ||
| dbt_internal_packages/ | ||
| logs/ | ||
| scripts/ | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,32 @@ | ||
| {% macro generate_schema_name(custom_schema_name, node) -%} | ||
| {%- set schema_name = target.schema -%} | ||
| {% if custom_schema_name %} | ||
| {% set schema_name = "{}_{}".format(schema_name, custom_schema_name) %} | ||
| {% endif %} | ||
|
|
||
| {% set schema_name = get_default_schema_name(custom_schema_name, node) %} | ||
| {% set schema_name_suffix_by_var = var('schema_name_suffix', '') %} | ||
| {% if schema_name_suffix_by_var %} | ||
| {% set schema_name = schema_name + schema_name_suffix_by_var %} | ||
| {% endif %} | ||
|
|
||
| {% do return(schema_name) %} | ||
| {%- endmacro %} | ||
|
|
||
| {% macro get_default_schema_name(custom_schema_name, node) -%} | ||
| {% do return(adapter.dispatch('get_default_schema_name', 'elementary_tests')(custom_schema_name, node)) %} | ||
| {% endmacro %} | ||
|
|
||
| {% macro default__get_default_schema_name(custom_schema_name, node) -%} | ||
| {%- set schema_name = target.schema -%} | ||
| {% if custom_schema_name %} | ||
| {% set schema_name = "{}_{}".format(schema_name, custom_schema_name) %} | ||
| {% endif %} | ||
| {% do return(schema_name) %} | ||
| {%- endmacro %} | ||
|
|
||
| {% macro dremio__get_default_schema_name(custom_schema_name, node) -%} | ||
| {%- set default_schema = target.schema if not is_datalake_node(node) else target.root_path -%} | ||
| {%- if not custom_schema_name -%} | ||
| {% do return(default_schema) %} | ||
| {%- elif default_schema == 'no_schema' -%} | ||
| {% do return(custom_schema_name) %} | ||
| {%- else -%} | ||
| {% do return("{}_{}".format(default_schema, custom_schema_name)) %} | ||
| {%- endif -%} | ||
| {%- endmacro %} |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,4 +6,4 @@ | |
| ) | ||
| }} | ||
|
|
||
| SELECT 1 AS one | ||
| SELECT 1 AS {{ elementary.escape_reserved_keywords('one') }} | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| services: | ||
| # Nessie Catalog Server Using In-Memory Store | ||
| nessie: | ||
| image: projectnessie/nessie:latest | ||
| container_name: catalog | ||
| networks: | ||
| - dremio-lakehouse | ||
| ports: | ||
| - 19120:19120 | ||
|
|
||
| # Minio Storage Server | ||
| minio: | ||
| image: minio/minio:latest | ||
| container_name: storage | ||
| environment: | ||
| - MINIO_ROOT_USER=admin | ||
| - MINIO_ROOT_PASSWORD=password | ||
| - MINIO_DOMAIN=storage | ||
| - MINIO_REGION_NAME=us-east-1 | ||
| - MINIO_REGION=us-east-1 | ||
| networks: | ||
| - dremio-lakehouse | ||
| ports: | ||
| - 9001:9001 | ||
| - 9000:9000 | ||
| command: ["server", "/data", "--console-address", ":9001"] | ||
| volumes: | ||
| - minio_data:/data | ||
|
|
||
| minio-setup: | ||
| image: minio/mc | ||
| container_name: minio-setup | ||
| depends_on: | ||
| - minio | ||
| entrypoint: > | ||
| /bin/sh -c " | ||
| until (echo > /dev/tcp/minio/9000) >/dev/null 2>&1; do | ||
| echo 'Waiting for MinIO...'; | ||
| sleep 2; | ||
| done; | ||
| mc alias set myminio http://minio:9000 admin password; | ||
| mc mb myminio/datalake; | ||
| mc ls myminio; | ||
| " | ||
| networks: | ||
| - dremio-lakehouse | ||
|
|
||
| # Dremio | ||
| dremio: | ||
| image: dremio/dremio-oss:latest | ||
| platform: linux/amd64 | ||
| ports: | ||
| - 9047:9047 | ||
| - 31010:31010 | ||
| - 32010:32010 | ||
| - 45678:45678 | ||
| container_name: dremio | ||
| environment: | ||
| - DREMIO_JAVA_SERVER_EXTRA_OPTS=-Dpaths.dist=file:///opt/dremio/data/dist -Ddebug.addDefaultUser=true | ||
| - SERVICES_COORDINATOR_ENABLED=true | ||
| - SERVICES_EXECUTOR_ENABLED=true | ||
| networks: | ||
| - dremio-lakehouse | ||
| volumes: | ||
| - dremio_data:/opt/dremio/data:rw | ||
| # Workaround for permission issues in podman | ||
| user: "0" | ||
|
|
||
| dremio-setup: | ||
| image: alpine:latest | ||
| container_name: dremio-setup | ||
| depends_on: | ||
| - dremio | ||
| volumes: | ||
| - ./docker/dremio/dremio-setup.sh:/dremio-setup.sh | ||
| command: sh /dremio-setup.sh | ||
| networks: | ||
| - dremio-lakehouse | ||
|
|
||
| networks: | ||
| dremio-lakehouse: | ||
|
|
||
| volumes: | ||
| dremio_data: | ||
| minio_data: | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| #!/bin/sh | ||
|
|
||
| # Install required tools | ||
| apk add --no-cache curl jq | ||
|
|
||
| # Wait for Dremio to be ready | ||
| until curl -s http://dremio:9047; do | ||
| echo "Waiting for Dremio..." | ||
| sleep 5 | ||
| done | ||
|
|
||
ofek1weiss marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| echo "Dremio is up. Proceeding with configuration..." | ||
|
|
||
| # Log in to Dremio to get the auth token | ||
| AUTH_TOKEN=$(curl -s -X POST "http://dremio:9047/apiv2/login" \ | ||
| -H "Content-Type: application/json" \ | ||
| --data "{\"userName\":\"dremio\", \"password\":\"dremio123\"}" | jq -r .token) | ||
|
|
||
| # Check if AUTH_TOKEN is not empty | ||
| if [ -z "$AUTH_TOKEN" ]; then | ||
| echo "Failed to obtain Dremio auth token" | ||
| exit 1 | ||
| fi | ||
|
|
||
| echo "Obtained Dremio auth token" | ||
|
|
||
| # Create the S3 source in Dremio | ||
| curl -s -X PUT "http://dremio:9047/apiv2/source/S3Source" \ | ||
| -H "Content-Type: application/json" \ | ||
| -H "Authorization: _dremio$AUTH_TOKEN" \ | ||
| --data "{\"name\":\"S3Source\",\"config\":{\"credentialType\":\"ACCESS_KEY\",\"accessKey\":\"admin\",\"accessSecret\":\"password\",\"secure\":false,\"externalBucketList\":[],\"enableAsync\":true,\"enableFileStatusCheck\":true,\"rootPath\":\"/\",\"defaultCtasFormat\":\"ICEBERG\",\"propertyList\":[{\"name\":\"fs.s3a.path.style.access\",\"value\":\"true\"},{\"name\":\"fs.s3a.endpoint\",\"value\":\"minio:9000\"},{\"name\":\"dremio.s3.compat\",\"value\":\"true\"}],\"whitelistedBuckets\":[],\"isCachingEnabled\":false,\"maxCacheSpacePct\":100},\"type\":\"S3\",\"metadataPolicy\":{\"deleteUnavailableDatasets\":true,\"autoPromoteDatasets\":false,\"namesRefreshMillis\":3600000,\"datasetDefinitionRefreshAfterMillis\":3600000,\"datasetDefinitionExpireAfterMillis\":10800000,\"authTTLMillis\":86400000,\"updateMode\":\"PREFETCH_QUERIED\"}}" | ||
|
|
||
| echo "S3 Source created in Dremio" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,10 +3,11 @@ | |
| from contextlib import contextmanager, nullcontext | ||
| from pathlib import Path | ||
| from tempfile import NamedTemporaryFile | ||
| from typing import Any, Dict, List, Literal, Optional, Union, overload | ||
| from typing import Any, Dict, Generator, List, Literal, Optional, Union, overload | ||
| from uuid import uuid4 | ||
|
|
||
| from data_seeder import DbtDataSeeder | ||
| from dbt_utils import get_database_and_schema_properties | ||
| from elementary.clients.dbt.base_dbt_runner import BaseDbtRunner | ||
| from elementary.clients.dbt.factory import create_dbt_runner | ||
| from logger import get_logger | ||
|
|
@@ -42,7 +43,7 @@ def get_dbt_runner(target: str, project_dir: str) -> BaseDbtRunner: | |
| class DbtProject: | ||
| def __init__(self, target: str, project_dir: str): | ||
| self.dbt_runner = get_dbt_runner(target, project_dir) | ||
|
|
||
| self.target = target | ||
| self.project_dir_path = Path(project_dir) | ||
| self.models_dir_path = self.project_dir_path / "models" | ||
| self.tmp_models_dir_path = self.models_dir_path / "tmp" | ||
|
|
@@ -187,12 +188,16 @@ def test( | |
| test_id, materialization | ||
| ) | ||
| else: | ||
| database_property, schema_property = get_database_and_schema_properties( | ||
| self.target | ||
| ) | ||
| props_yaml = { | ||
| "version": 2, | ||
| "sources": [ | ||
| { | ||
| "name": "test_data", | ||
| "schema": f"{{{{ target.schema }}}}{SCHEMA_NAME_SUFFIX}", | ||
| "schema": f"{{{{ target.{schema_property} }}}}{SCHEMA_NAME_SUFFIX}", | ||
| "database": f"{{{{ target.{database_property} }}}}", | ||
| "tables": [table_yaml], | ||
|
Comment on lines
+191
to
201
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Verification agent 🧩 Analysis chain

Dynamic database/schema properties in sources: good direction; please verify across adapters

The switch to `target.{database_property}` / `target.{schema_property}` in the generated source YAML looks right, but please verify the mapping is correct for every adapter exercised by the tests (not only Dremio and ClickHouse).
🏁 Script executed: #!/bin/bash
# Validate that both database & schema in source YAML are expected for each warehouse in tests.
# Grep where props are generated and ensure mappings exist for all targets we test.
rg -n "get_database_and_schema_properties" integration_tests/tests
rg -n "def get_database_and_schema_properties" -A 30 integration_tests/tests

Length of output: 1239

It looks like we need to update the property mapping for BigQuery (and any other adapters) rather than relying on the default mapping.

Next steps:
Tags: 🤖 Prompt for AI Agents
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test pass for bigquery, your suggested change is unneeded There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @GuyEshdat, thank you for the clarification! I appreciate you confirming that the BigQuery tests pass without the additional change. The default ✏️ Learnings added
|
||
| } | ||
| ], | ||
|
|
@@ -230,9 +235,19 @@ def test( | |
| return [test_result] if multiple_results else test_result | ||
|
|
||
| def seed(self, data: List[dict], table_name: str): | ||
| return DbtDataSeeder( | ||
| with DbtDataSeeder( | ||
| self.dbt_runner, self.project_dir_path, self.seeds_dir_path | ||
| ).seed(data, table_name): | ||
| return | ||
|
|
||
|
Comment on lines
+238
to
+242
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Verification agent 🧩 Analysis chain

Bug: seed lifecycle ends before the test runs
Fix: Use `seed_context` so the seeded data outlives the test run.

Proposed structure inside `test` (around current lines ~207-224):
if data:
with self.seed_context(data, table_name), temp_table_ctx:
# write props and run test as today
with NamedTemporaryFile(
dir=self.tmp_models_dir_path,
prefix="integration_tests_",
suffix=".yaml",
) as props_file:
YAML().dump(props_yaml, props_file)
relative_props_path = Path(props_file.name).relative_to(self.project_dir_path)
test_process_success = self.dbt_runner.test(select=str(relative_props_path), vars=test_vars)
else:
with temp_table_ctx:
with NamedTemporaryFile(
dir=self.tmp_models_dir_path,
prefix="integration_tests_",
suffix=".yaml",
) as props_file:
YAML().dump(props_yaml, props_file)
relative_props_path = Path(props_file.name).relative_to(self.project_dir_path)
test_process_success = self.dbt_runner.test(select=str(relative_props_path), vars=test_vars)

Optionally, we could also deprecate the non-context `seed` method.

🏁 Script executed:

#!/bin/bash
# Find call sites still using `seed(` (non-context) so we can migrate them to `seed_context`.
rg -n --no-heading $'\\bseed\\s*\\(' integration_tests/tests | rg -v 'seed_context'Length of output: 1443 🏁 Script executed: #!/bin/bash
# Inspect DataSeeder.seed implementation
sed -n '1,200p' integration_tests/tests/data_seeder.py
# Inspect DbtProject.seed and surrounding context
sed -n '200,300p' integration_tests/tests/dbt_project.py
# Check if seed_context is defined in DbtProject
rg -n 'seed_context' integration_tests/tests/dbt_project.pyLength of output: 5242 Enforce seed lifecycle over test execution using seed_context The • In if data:
self.seed(data, table_name)
with temp_table_ctx:
… # run testwith: if data:
with self.seed_context(data, table_name), temp_table_ctx:
… # run test
else:
with temp_table_ctx:
… # run test• Update any standalone calls to with dbt_project.seed_context(data, table_name):
… # subsequent test or setup codeOptionally, deprecate or remove the 🤖 Prompt for AI Agents |
||
| @contextmanager | ||
| def seed_context( | ||
| self, data: List[dict], table_name: str | ||
| ) -> Generator[None, None, None]: | ||
| with DbtDataSeeder( | ||
| self.dbt_runner, self.project_dir_path, self.seeds_dir_path | ||
| ).seed(data, table_name) | ||
| ).seed(data, table_name): | ||
| yield | ||
|
|
||
| @contextmanager | ||
| def create_temp_model_for_existing_table( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| def get_database_and_schema_properties(target: str, is_view: bool = False): | ||
| if target == "dremio" and not is_view: | ||
| return "datalake", "root_path" | ||
| elif target == "clickhouse": | ||
| return "schema", "schema" | ||
| return "database", "schema" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| import pytest | ||
| from dbt_flags import set_flags | ||
| from dbt_project import DbtProject | ||
| from dbt_utils import get_database_and_schema_properties | ||
|
|
||
| TEST_MODEL = "one" | ||
|
|
||
|
|
@@ -95,17 +96,25 @@ def test_metrics_anomaly_score(dbt_project: DbtProject): | |
|
|
||
| @pytest.mark.requires_dbt_version("1.8.0") | ||
| def test_source_freshness_results(test_id: str, dbt_project: DbtProject): | ||
| database_property, schema_property = get_database_and_schema_properties( | ||
| dbt_project.target | ||
| ) | ||
| loaded_at_field = ( | ||
| '"UPDATE_TIME"::timestamp' | ||
| if dbt_project.target != "dremio" | ||
| else "TO_TIMESTAMP(SUBSTRING(UPDATE_TIME, 0, 23), 'YYYY-MM-DD HH24:MI:SS.FFF')" | ||
| ) | ||
|
Comment on lines
+99
to
+106
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Fix Dremio loaded_at_field: ensure proper casting and 1-based substring SUBSTRING should start at 1 (not 0), and applying SUBSTRING to a TIMESTAMP needs CAST to VARCHAR. - loaded_at_field = (
- '"UPDATE_TIME"::timestamp'
- if dbt_project.target != "dremio"
- else "TO_TIMESTAMP(SUBSTRING(UPDATE_TIME, 0, 23), 'YYYY-MM-DD HH24:MI:SS.FFF')"
- )
+ loaded_at_field = (
+ '"UPDATE_TIME"::timestamp'
+ if dbt_project.target != "dremio"
+ else "TO_TIMESTAMP(SUBSTR(CAST(UPDATE_TIME AS VARCHAR), 1, 23), 'YYYY-MM-DD HH24:MI:SS.FFF')"
+ )This keeps the fractional seconds to millisecond precision and avoids type errors. Also applies to: 117-117 🤖 Prompt for AI Agents |
||
| source_config = { | ||
| "version": 2, | ||
| "sources": [ | ||
| { | ||
| "name": "test_source", | ||
| "database": "{{target.database if target.type != 'clickhouse' else target.schema}}", | ||
| "schema": "{{target.schema}}", | ||
| "database": f"{{{{ target.{database_property} }}}}", | ||
| "schema": f"{{{{ target.{schema_property} }}}}", | ||
| "tables": [ | ||
| { | ||
| "name": test_id, | ||
| "loaded_at_field": '"UPDATE_TIME"::timestamp', | ||
| "loaded_at_field": loaded_at_field, | ||
| "freshness": { | ||
| "warn_after": { | ||
| "count": 1, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.