Skip to content

Commit 8dbdf4a

Browse files
Support delta and iceberg via builder pattern (#146)
1 parent 570cec1 commit 8dbdf4a

35 files changed

+1200
-657
lines changed

.coveragerc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[run]
2+
omit = tests/*

.github/workflows/build.yaml

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,44 +16,48 @@ jobs:
1616
packages: read
1717
pull-requests: write
1818

19-
container:
20-
image: ghcr.io/sneaksanddata/spark:v5.0.3
21-
credentials:
22-
username: USERNAME
23-
password: ${{ secrets.GHCR_TOKEN }}
24-
options: -u root -w /opt/spark --mount type=tmpfs,destination=/home/spark
25-
2619
steps:
27-
- uses: actions/checkout@v4
20+
- uses: actions/checkout@v6
2821
with:
2922
fetch-depth: 0
3023

31-
- name: Install Poetry
32-
run: |
33-
set -e
34-
35-
curl -sSL https://install.python-poetry.org | python3 -
36-
- name: Install Dependencies
37-
run: |
38-
set -e
39-
40-
/github/home/.local/bin/poetry install
24+
- name: Build the Docker Compose stack
25+
run: docker compose up --quiet-pull -d
26+
27+
- name: Get Compose Logs
28+
if: always()
29+
run: docker compose logs
30+
31+
- uses: actions/setup-python@v5
32+
with:
33+
python-version: '3.11.x' # Version range or exact version of a Python version to use, using SemVer's version range syntax
34+
architecture: 'x64' # optional x64 or x86. Defaults to x64 if not specified
35+
36+
- name: Install Poetry and dependencies
37+
uses: SneaksAndData/github-actions/[email protected]
38+
with:
39+
install_extras: all
40+
4141
- name: Black
4242
shell: bash
4343
run: |
4444
set -e
45-
/github/home/.local/bin/poetry run black . --check --diff
45+
46+
poetry run black . --check --diff
4647
- name: Lint
4748
run: |
4849
set -e
4950
50-
find ./spark_utils -type f -name "*.py" | xargs /github/home/.local/bin/poetry run pylint
51+
find ./spark_utils -type f -name "*.py" | xargs poetry run pylint
52+
5153
- name: Unit test
5254
shell: bash
55+
env:
56+
PYSPARK_SUBMIT_ARGS: "--packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.0,org.apache.iceberg:iceberg-aws-bundle:1.10.0,io.delta:delta-spark_2.12:3.2.1 pyspark-shell"
5357
run: |
5458
set -euxo pipefail
5559
56-
/github/home/.local/bin/poetry run pytest ./test --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-report=term-missing:skip-covered | tee pytest-coverage.txt
60+
poetry run pytest --cov-config=.coveragerc --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-report=term-missing:skip-covered | tee pytest-coverage.txt
5761
- name: Publish Code Coverage
5862
uses: MishaKav/pytest-coverage-comment@main
5963
with:

.github/workflows/prepare_release.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
with:
1414
fetch-depth: 0
1515
- name: Create Release
16-
uses: SneaksAndData/github-actions/[email protected].9
16+
uses: SneaksAndData/github-actions/[email protected].11
1717
with:
18-
major_v: 1
19-
minor_v: 3
18+
major_v: 2
19+
minor_v: 0

.pylintrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# can either give multiple identifier separated by comma (,) or put this option
55
# multiple time (only on the command line, not in the configuration file where
66
# it should appear only once).
7-
disable=C0301,R0903,W0511,C0303,W0107,R0913,R0902
7+
disable=C0301,C0114,R0903,W0511,C0303,W0107,R0913,R0902
88

99
[DESIGN]
1010
max-args=10

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2022 Ecco Sneaks & Data
3+
Copyright (c) 2022-2026 Ecco Sneaks & Data
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

bootstrap-lk.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import requests
2+
3+
4+
def _raise_unless_tolerated(resp, tolerated):
    """Raise ``requests.HTTPError`` for an error response unless its status is tolerated.

    :param resp: Response object returned by a ``requests`` call.
    :param tolerated: Collection of status codes that are accepted as-is.
    """
    if resp.status_code not in tolerated:
        resp.raise_for_status()


def init_lk(base_url="http://localhost:20001", timeout=30):
    """Prepare the catalog service for tests: bootstrap, create a warehouse, add a namespace.

    Every step accepts its "already done" status code, so the function can be run
    repeatedly against the same catalog instance.

    :param base_url: Root URL of the catalog service (management and catalog APIs).
    :param timeout: Per-request timeout in seconds; ``requests`` waits forever without one.
    :raises requests.HTTPError: If any step returns an error status that is not tolerated.
    """
    # bootstrap
    # 400 is accepted as in the original script - presumably "already bootstrapped"; TODO confirm.
    resp = requests.post(
        url=f"{base_url}/management/v1/bootstrap",
        json={"accept-terms-of-use": True},
        timeout=timeout,
    )
    _raise_unless_tolerated(resp, (200, 400))

    # create warehouse
    resp = requests.post(
        url=f"{base_url}/management/v1/warehouse",
        json={
            "warehouse-name": "demo",
            "project-id": "00000000-0000-0000-0000-000000000000",
            "storage-profile": {
                "type": "s3",
                "bucket": "tmp",
                "key-prefix": "initial-warehouse",
                "assume-role-arn": None,
                "endpoint": "http://localhost:9000",
                "region": "us-east-1",
                "path-style-access": True,
                "flavor": "minio",
                "sts-enabled": False,
            },
            "storage-credential": {
                "type": "s3",
                "credential-type": "access-key",
                "aws-access-key-id": "minioadmin",
                "aws-secret-access-key": "minioadmin",
            },
        },
        timeout=timeout,
    )
    # 409 means a warehouse with this name already exists.
    _raise_unless_tolerated(resp, (200, 409))

    # add namespace
    config_resp = requests.get(
        f"{base_url}/catalog/v1/config",
        params={"warehouse": "demo"},
        timeout=timeout,
    )
    # Fail with the HTTP error itself rather than a KeyError/JSON error on an error payload.
    config_resp.raise_for_status()
    warehouse_prefix = config_resp.json()["defaults"]["prefix"]

    ns_resp = requests.post(
        f"{base_url}/catalog/v1/{warehouse_prefix}/namespaces",
        json={"namespace": ["test"]},
        timeout=timeout,
    )
    # 409 = namespace already exists; tolerated so that re-runs behave like the steps above.
    _raise_unless_tolerated(ns_resp, (200, 409))


if __name__ == "__main__":
    init_lk()

docker-compose.yaml

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
version: '3.3'

# Local/CI test stack: MinIO (S3 storage), Postgres and the Lakekeeper catalog.
# All services use host networking, so they reach each other through localhost.
services:
  # S3-compatible object storage; API on :9000, console on :9001.
  minio:
    container_name: minio
    hostname: minio-e2e
    image: quay.io/minio/minio
    network_mode: host
    restart: always
    command:
      - server
      - /data
      - "--console-address"
      - ":9001"
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://localhost:9001" ]
      interval: 30s
      timeout: 10s
      retries: 5
  # One-shot job that creates the `tmp` and `iceberg` buckets once MinIO is healthy.
  prepare_buckets:
    container_name: minio-setup
    image: quay.io/minio/minio
    network_mode: host
    depends_on:
      minio:
        condition: service_healthy
    entrypoint:
      - "/bin/sh"
      - "-c"
      - |
        mc alias set e2e "http://localhost:9000" minioadmin minioadmin
        mc admin info e2e
        mc mb --ignore-existing e2e/tmp \
        && mc mb --ignore-existing e2e/iceberg
  # https://github.com/databricks/docker-spark-iceberg/blob/main/docker-compose.yml
  # Catalog server; started only after the schema migration job has completed.
  lakekeeper:
    image: quay.io/lakekeeper/catalog:v0.9.2
    network_mode: host
    environment:
      # - LAKEKEEPER__PG_ENCRYPTION_KEY=This-is-NOT-Secure!
      - LAKEKEEPER__LISTEN_PORT=20001
      - LAKEKEEPER__METRICS_PORT=20002
      - LAKEKEEPER__BASE_URI=http://localhost:20001
      - ICEBERG_REST__BASE_URI=http://localhost:20001
      - LAKEKEEPER__PG_DATABASE_URL_READ=postgresql://postgres:postgres@localhost:5432/postgres
      - LAKEKEEPER__PG_DATABASE_URL_WRITE=postgresql://postgres:postgres@localhost:5432/postgres
      - RUST_LOG=trace,axum=trace,sqlx=trace,iceberg-catalog=trace
    command: [ "serve" ]
    healthcheck:
      test: [ "CMD", "/home/nonroot/lakekeeper", "healthcheck" ]
      interval: 1s
      timeout: 10s
      retries: 30
    depends_on:
      lakekeeper_migrate:
        condition: service_completed_successfully
      lakekeeper_db:
        condition: service_healthy
      minio:
        condition: service_healthy
  # One-shot database migration for the catalog.
  lakekeeper_migrate:
    image: quay.io/lakekeeper/catalog:v0.9.2
    network_mode: host
    environment:
      # - LAKEKEEPER__PG_ENCRYPTION_KEY=This-is-NOT-Secure!
      - LAKEKEEPER__PG_DATABASE_URL_READ=postgresql://postgres:postgres@localhost:5432/postgres
      - LAKEKEEPER__PG_DATABASE_URL_WRITE=postgresql://postgres:postgres@localhost:5432/postgres
      - RUST_LOG=info
    restart: "no"
    command: [ "migrate" ]
    depends_on:
      lakekeeper_db:
        condition: service_healthy
  # One-shot job that runs bootstrap-lk.py against the running catalog.
  lakekeeper_prepare:
    image: python:3.11-slim-bookworm
    network_mode: host
    environment:
      CATALOG_HOST: localhost
      CATALOG_WAREHOUSE: demo
    depends_on:
      lakekeeper:
        condition: service_healthy
      # bootstrap-lk.py creates its warehouse on the `tmp` bucket, so the
      # bucket must exist before the script runs.
      prepare_buckets:
        condition: service_completed_successfully
    restart: "no"
    volumes:
      - ./bootstrap-lk.py:/bootstrap-lk.py
    command: [ "/bin/sh", "-c", "pip install requests && python /bootstrap-lk.py" ]
  # Postgres backing store for the catalog.
  lakekeeper_db:
    image: postgres:16.3-bullseye
    network_mode: host
    restart: always
    environment:
      # The official image reads POSTGRES_USER; POSTGRES_USERNAME is not a recognised variable.
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=postgres
      - POSTGRES_HOST_AUTH_METHOD=trust
      - POSTGRES_DB=postgres
      - PGHOST=localhost
      - PGPORT=5432
    healthcheck:
      test: [ "CMD-SHELL", "pg_isready -U postgres -p 5432 -d postgres" ]
      interval: 2s
      timeout: 30s
      retries: 8

0 commit comments

Comments
 (0)