-
Notifications
You must be signed in to change notification settings - Fork 121
Ele 4874 dremio #838
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ele 4874 dremio #838
Changes from all commits
6931918
ecfa917
fd066cc
a8a4374
c002b47
90fe8db
6d4eda7
95948c3
0412bae
3e251cc
11976a2
9a63dba
1b63275
a9017b4
c56ed1e
da8e5f2
b523e08
79d8ae1
c4e6494
9d81741
489a724
d346398
bce218f
83489c2
4f4603e
b6fb5fc
7d6bdbf
cec4698
c5c689c
65c39fa
73c88ed
e616d80
cf41fca
62ccff6
dc51698
6a3a7f4
c825966
2374288
4c537c3
9b2fddf
b66e737
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| target/ | ||
| dbt_packages/ | ||
| dbt_internal_packages/ | ||
| logs/ | ||
| scripts/ | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,32 @@ | ||
| {% macro generate_schema_name(custom_schema_name, node) -%} | ||
| {%- set schema_name = target.schema -%} | ||
| {% if custom_schema_name %} | ||
| {% set schema_name = "{}_{}".format(schema_name, custom_schema_name) %} | ||
| {% endif %} | ||
|
|
||
| {% set schema_name = get_default_schema_name(custom_schema_name, node) %} | ||
| {% set schema_name_suffix_by_var = var('schema_name_suffix', '') %} | ||
| {% if schema_name_suffix_by_var %} | ||
| {% set schema_name = schema_name + schema_name_suffix_by_var %} | ||
| {% endif %} | ||
|
|
||
| {% do return(schema_name) %} | ||
| {%- endmacro %} | ||
|
|
||
| {% macro get_default_schema_name(custom_schema_name, node) -%} | ||
| {% do return(adapter.dispatch('get_default_schema_name', 'elementary_tests')(custom_schema_name, node)) %} | ||
| {% endmacro %} | ||
|
|
||
| {% macro default__get_default_schema_name(custom_schema_name, node) -%} | ||
| {%- set schema_name = target.schema -%} | ||
| {% if custom_schema_name %} | ||
| {% set schema_name = "{}_{}".format(schema_name, custom_schema_name) %} | ||
| {% endif %} | ||
| {% do return(schema_name) %} | ||
| {%- endmacro %} | ||
|
|
||
| {% macro dremio__get_default_schema_name(custom_schema_name, node) -%} | ||
| {%- set default_schema = target.schema if not is_datalake_node(node) else target.root_path -%} | ||
| {%- if not custom_schema_name -%} | ||
| {% do return(default_schema) %} | ||
| {%- elif default_schema == 'no_schema' -%} | ||
| {% do return(custom_schema_name) %} | ||
| {%- else -%} | ||
| {% do return("{}_{}".format(default_schema, custom_schema_name)) %} | ||
| {%- endif -%} | ||
| {%- endmacro %} |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,4 +6,4 @@ | |
| ) | ||
| }} | ||
|
|
||
| SELECT 1 AS one | ||
| SELECT 1 AS {{ elementary.escape_reserved_keywords('one') }} | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| services: | ||
| # Nessie Catalog Server Using In-Memory Store | ||
| nessie: | ||
| image: projectnessie/nessie:latest | ||
| container_name: catalog | ||
| networks: | ||
| - dremio-lakehouse | ||
| ports: | ||
| - 19120:19120 | ||
|
|
||
| # Minio Storage Server | ||
| minio: | ||
| image: minio/minio:latest | ||
| container_name: storage | ||
| environment: | ||
| - MINIO_ROOT_USER=admin | ||
| - MINIO_ROOT_PASSWORD=password | ||
| - MINIO_DOMAIN=storage | ||
| - MINIO_REGION_NAME=us-east-1 | ||
| - MINIO_REGION=us-east-1 | ||
| networks: | ||
| - dremio-lakehouse | ||
| ports: | ||
| - 9001:9001 | ||
| - 9000:9000 | ||
| command: ["server", "/data", "--console-address", ":9001"] | ||
| volumes: | ||
| - minio_data:/data | ||
|
|
||
| minio-setup: | ||
| image: minio/mc | ||
| container_name: minio-setup | ||
| depends_on: | ||
| - minio | ||
| entrypoint: > | ||
| /bin/sh -c " | ||
| until (echo > /dev/tcp/minio/9000) >/dev/null 2>&1; do | ||
| echo 'Waiting for MinIO...'; | ||
| sleep 2; | ||
| done; | ||
| mc alias set myminio http://minio:9000 admin password; | ||
| mc mb myminio/datalake; | ||
| mc ls myminio; | ||
| " | ||
| networks: | ||
| - dremio-lakehouse | ||
|
|
||
| # Dremio | ||
| dremio: | ||
| image: dremio/dremio-oss:latest | ||
| platform: linux/amd64 | ||
| ports: | ||
| - 9047:9047 | ||
| - 31010:31010 | ||
| - 32010:32010 | ||
| - 45678:45678 | ||
| container_name: dremio | ||
| environment: | ||
| - DREMIO_JAVA_SERVER_EXTRA_OPTS=-Dpaths.dist=file:///opt/dremio/data/dist -Ddebug.addDefaultUser=true | ||
| - SERVICES_COORDINATOR_ENABLED=true | ||
| - SERVICES_EXECUTOR_ENABLED=true | ||
| networks: | ||
| - dremio-lakehouse | ||
| volumes: | ||
| - dremio_data:/opt/dremio/data:rw | ||
| # Workaround for permission issues in podman | ||
| user: "0" | ||
|
|
||
| dremio-setup: | ||
| image: alpine:latest | ||
| container_name: dremio-setup | ||
| depends_on: | ||
| - dremio | ||
| volumes: | ||
| - ./docker/dremio/dremio-setup.sh:/dremio-setup.sh | ||
| command: sh /dremio-setup.sh | ||
| networks: | ||
| - dremio-lakehouse | ||
|
|
||
| networks: | ||
| dremio-lakehouse: | ||
|
|
||
| volumes: | ||
| dremio_data: | ||
| minio_data: | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| #!/bin/sh | ||
|
|
||
| # Install required tools | ||
| apk add --no-cache curl jq | ||
|
|
||
| # Wait for Dremio to be ready | ||
| until curl -s http://dremio:9047; do | ||
| echo "Waiting for Dremio..." | ||
| sleep 5 | ||
| done | ||
|
|
||
ofek1weiss marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| echo "Dremio is up. Proceeding with configuration..." | ||
|
|
||
| # Log in to Dremio to get the auth token | ||
| AUTH_TOKEN=$(curl -s -X POST "http://dremio:9047/apiv2/login" \ | ||
| -H "Content-Type: application/json" \ | ||
| --data "{\"userName\":\"dremio\", \"password\":\"dremio123\"}" | jq -r .token) | ||
|
|
||
| # Check if AUTH_TOKEN is not empty | ||
| if [ -z "$AUTH_TOKEN" ]; then | ||
| echo "Failed to obtain Dremio auth token" | ||
| exit 1 | ||
| fi | ||
|
|
||
| echo "Obtained Dremio auth token" | ||
|
|
||
| # Create the S3 source in Dremio | ||
| curl -s -X PUT "http://dremio:9047/apiv2/source/S3Source" \ | ||
| -H "Content-Type: application/json" \ | ||
| -H "Authorization: _dremio$AUTH_TOKEN" \ | ||
| --data "{\"name\":\"S3Source\",\"config\":{\"credentialType\":\"ACCESS_KEY\",\"accessKey\":\"admin\",\"accessSecret\":\"password\",\"secure\":false,\"externalBucketList\":[],\"enableAsync\":true,\"enableFileStatusCheck\":true,\"rootPath\":\"/\",\"defaultCtasFormat\":\"ICEBERG\",\"propertyList\":[{\"name\":\"fs.s3a.path.style.access\",\"value\":\"true\"},{\"name\":\"fs.s3a.endpoint\",\"value\":\"minio:9000\"},{\"name\":\"dremio.s3.compat\",\"value\":\"true\"}],\"whitelistedBuckets\":[],\"isCachingEnabled\":false,\"maxCacheSpacePct\":100},\"type\":\"S3\",\"metadataPolicy\":{\"deleteUnavailableDatasets\":true,\"autoPromoteDatasets\":false,\"namesRefreshMillis\":3600000,\"datasetDefinitionRefreshAfterMillis\":3600000,\"datasetDefinitionExpireAfterMillis\":10800000,\"authTTLMillis\":86400000,\"updateMode\":\"PREFETCH_QUERIED\"}}" | ||
|
|
||
| echo "S3 Source created in Dremio" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,10 +3,11 @@ | |
| from contextlib import contextmanager, nullcontext | ||
| from pathlib import Path | ||
| from tempfile import NamedTemporaryFile | ||
| from typing import Any, Dict, List, Literal, Optional, Union, overload | ||
| from typing import Any, Dict, Generator, List, Literal, Optional, Union, overload | ||
| from uuid import uuid4 | ||
|
|
||
| from data_seeder import DbtDataSeeder | ||
| from dbt_utils import get_database_and_schema_properties | ||
| from elementary.clients.dbt.base_dbt_runner import BaseDbtRunner | ||
| from elementary.clients.dbt.factory import create_dbt_runner | ||
| from logger import get_logger | ||
|
|
@@ -42,7 +43,7 @@ def get_dbt_runner(target: str, project_dir: str) -> BaseDbtRunner: | |
| class DbtProject: | ||
| def __init__(self, target: str, project_dir: str): | ||
| self.dbt_runner = get_dbt_runner(target, project_dir) | ||
|
|
||
| self.target = target | ||
| self.project_dir_path = Path(project_dir) | ||
| self.models_dir_path = self.project_dir_path / "models" | ||
| self.tmp_models_dir_path = self.models_dir_path / "tmp" | ||
|
|
@@ -187,12 +188,16 @@ def test( | |
| test_id, materialization | ||
| ) | ||
| else: | ||
| database_property, schema_property = get_database_and_schema_properties( | ||
| self.target | ||
| ) | ||
| props_yaml = { | ||
| "version": 2, | ||
| "sources": [ | ||
| { | ||
| "name": "test_data", | ||
| "schema": f"{{{{ target.schema }}}}{SCHEMA_NAME_SUFFIX}", | ||
| "schema": f"{{{{ target.{schema_property} }}}}{SCHEMA_NAME_SUFFIX}", | ||
| "database": f"{{{{ target.{database_property} }}}}", | ||
| "tables": [table_yaml], | ||
|
Comment on lines
+191
to
201
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Verification agent 🧩 Analysis chain

Dynamic database/schema properties in sources: good direction; please verify across adapters

The switch to `target.{database_property}` / `target.{schema_property}` in the generated source YAML looks right, but please verify the mapping is correct for every adapter exercised by the tests (not only Dremio and ClickHouse).
🏁 Script executed: #!/bin/bash
# Validate that both database & schema in source YAML are expected for each warehouse in tests.
# Grep where props are generated and ensure mappings exist for all targets we test.
rg -n "get_database_and_schema_properties" integration_tests/tests
rg -n "def get_database_and_schema_properties" -A 30 integration_tests/tests

Length of output: 1239

It looks like we need to update the property mapping for BigQuery (and any other adapters) rather than relying on the default mapping.

Next steps:
Tags: 🤖 Prompt for AI Agents
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test pass for bigquery, your suggested change is unneeded There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @GuyEshdat, thank you for the clarification! I appreciate you confirming that the BigQuery tests pass without the additional change. The default ✏️ Learnings added
|
||
| } | ||
| ], | ||
|
|
@@ -230,9 +235,19 @@ def test( | |
| return [test_result] if multiple_results else test_result | ||
|
|
||
| def seed(self, data: List[dict], table_name: str): | ||
| return DbtDataSeeder( | ||
| with DbtDataSeeder( | ||
| self.dbt_runner, self.project_dir_path, self.seeds_dir_path | ||
| ).seed(data, table_name): | ||
| return | ||
|
|
||
|
Comment on lines
+238
to
+242
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Verification agent 🧩 Analysis chain

Bug: seed lifecycle ends before the test runs
Fix: Use `seed_context` so the seeded data outlives the test run.

Proposed structure inside `test` (around current lines ~207-224):
if data:
with self.seed_context(data, table_name), temp_table_ctx:
# write props and run test as today
with NamedTemporaryFile(
dir=self.tmp_models_dir_path,
prefix="integration_tests_",
suffix=".yaml",
) as props_file:
YAML().dump(props_yaml, props_file)
relative_props_path = Path(props_file.name).relative_to(self.project_dir_path)
test_process_success = self.dbt_runner.test(select=str(relative_props_path), vars=test_vars)
else:
with temp_table_ctx:
with NamedTemporaryFile(
dir=self.tmp_models_dir_path,
prefix="integration_tests_",
suffix=".yaml",
) as props_file:
YAML().dump(props_yaml, props_file)
relative_props_path = Path(props_file.name).relative_to(self.project_dir_path)
test_process_success = self.dbt_runner.test(select=str(relative_props_path), vars=test_vars)

Optionally, we could also deprecate the non-context `seed` method.

🏁 Script executed:

#!/bin/bash
# Find call sites still using `seed(` (non-context) so we can migrate them to `seed_context`.
rg -n --no-heading $'\\bseed\\s*\\(' integration_tests/tests | rg -v 'seed_context'Length of output: 1443 🏁 Script executed: #!/bin/bash
# Inspect DataSeeder.seed implementation
sed -n '1,200p' integration_tests/tests/data_seeder.py
# Inspect DbtProject.seed and surrounding context
sed -n '200,300p' integration_tests/tests/dbt_project.py
# Check if seed_context is defined in DbtProject
rg -n 'seed_context' integration_tests/tests/dbt_project.pyLength of output: 5242 Enforce seed lifecycle over test execution using seed_context The • In if data:
self.seed(data, table_name)
with temp_table_ctx:
… # run testwith: if data:
with self.seed_context(data, table_name), temp_table_ctx:
… # run test
else:
with temp_table_ctx:
… # run test• Update any standalone calls to with dbt_project.seed_context(data, table_name):
… # subsequent test or setup codeOptionally, deprecate or remove the 🤖 Prompt for AI Agents |
||
| @contextmanager | ||
| def seed_context( | ||
| self, data: List[dict], table_name: str | ||
| ) -> Generator[None, None, None]: | ||
| with DbtDataSeeder( | ||
| self.dbt_runner, self.project_dir_path, self.seeds_dir_path | ||
| ).seed(data, table_name) | ||
| ).seed(data, table_name): | ||
| yield | ||
|
|
||
| @contextmanager | ||
| def create_temp_model_for_existing_table( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| def get_database_and_schema_properties(target: str, is_view: bool = False): | ||
| if target == "dremio" and not is_view: | ||
| return "datalake", "root_path" | ||
| elif target == "clickhouse": | ||
| return "schema", "schema" | ||
| return "database", "schema" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| import pytest | ||
| from dbt_flags import set_flags | ||
| from dbt_project import DbtProject | ||
| from dbt_utils import get_database_and_schema_properties | ||
|
|
||
| TEST_MODEL = "one" | ||
|
|
||
|
|
@@ -95,17 +96,25 @@ def test_metrics_anomaly_score(dbt_project: DbtProject): | |
|
|
||
| @pytest.mark.requires_dbt_version("1.8.0") | ||
| def test_source_freshness_results(test_id: str, dbt_project: DbtProject): | ||
| database_property, schema_property = get_database_and_schema_properties( | ||
| dbt_project.target | ||
| ) | ||
| loaded_at_field = ( | ||
| '"UPDATE_TIME"::timestamp' | ||
| if dbt_project.target != "dremio" | ||
| else "TO_TIMESTAMP(SUBSTRING(UPDATE_TIME, 0, 23), 'YYYY-MM-DD HH24:MI:SS.FFF')" | ||
| ) | ||
|
Comment on lines
+99
to
+106
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Fix Dremio loaded_at_field: ensure proper casting and 1-based substring SUBSTRING should start at 1 (not 0), and applying SUBSTRING to a TIMESTAMP needs CAST to VARCHAR. - loaded_at_field = (
- '"UPDATE_TIME"::timestamp'
- if dbt_project.target != "dremio"
- else "TO_TIMESTAMP(SUBSTRING(UPDATE_TIME, 0, 23), 'YYYY-MM-DD HH24:MI:SS.FFF')"
- )
+ loaded_at_field = (
+ '"UPDATE_TIME"::timestamp'
+ if dbt_project.target != "dremio"
+ else "TO_TIMESTAMP(SUBSTR(CAST(UPDATE_TIME AS VARCHAR), 1, 23), 'YYYY-MM-DD HH24:MI:SS.FFF')"
+ )This keeps the fractional seconds to millisecond precision and avoids type errors. Also applies to: 117-117 🤖 Prompt for AI Agents |
||
| source_config = { | ||
| "version": 2, | ||
| "sources": [ | ||
| { | ||
| "name": "test_source", | ||
| "database": "{{target.database if target.type != 'clickhouse' else target.schema}}", | ||
| "schema": "{{target.schema}}", | ||
| "database": f"{{{{ target.{database_property} }}}}", | ||
| "schema": f"{{{{ target.{schema_property} }}}}", | ||
| "tables": [ | ||
| { | ||
| "name": test_id, | ||
| "loaded_at_field": '"UPDATE_TIME"::timestamp', | ||
| "loaded_at_field": loaded_at_field, | ||
| "freshness": { | ||
| "warn_after": { | ||
| "count": 1, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.