Skip to content

Commit 2dac9f5

Browse files
committed
feat: fully working demo lake
1 parent 78c76f5 commit 2dac9f5

File tree

5 files changed

+110
-9
lines changed

5 files changed

+110
-9
lines changed

README.md

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,61 @@ flowchart TD
7979

8080
4. **Verify the lakehouse**
8181

82+
Once you have a client up via step 3, all setup should be automatic from the Docker container. Running something like the following should load some gene data into your lake.
83+
8284
```sql
83-
INSTALL ducklake;
84-
INSTALL postgres;
85-
ATTACH 'ducklake:postgres:dbname=ducklake_catalog host=postgres user=ducklake password=ducklake' AS the_ducklake (DATA_PATH 's3://ducklake/lake/');
86-
USE the_ducklake;
87-
SELECT * FROM ducklake.schema;
85+
CREATE OR REPLACE TABLE gene AS
86+
SELECT *
87+
FROM read_csv_auto(
88+
'https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/non_alt_loci_set.txt',
89+
HEADER => TRUE,
90+
DELIM => '\t',
91+
SAMPLE_SIZE => 100000
92+
);
8893
```
8994

95+
Then `select * from gene limit 10;`
96+
```
97+
┌────────────┬─────────┬──────────────────────┬───┬──────────────────────┬───────────┐
98+
│ hgnc_id │ symbol │ name │ … │ mane_select │ gencc │
99+
│ varchar │ varchar │ varchar │ │ varchar │ varchar │
100+
├────────────┼─────────┼──────────────────────┼───┼──────────────────────┼───────────┤
101+
│ HGNC:2973 │ DNM1L │ dynamin 1 like │ … │ ENST00000549701.6|… │ HGNC:2973 │
102+
│ HGNC:21122 │ DNM1P5 │ dynamin 1 pseudoge… │ … │ NULL │ NULL │
103+
│ HGNC:21126 │ DNM1P9 │ dynamin 1 pseudoge… │ … │ NULL │ NULL │
104+
│ HGNC:21134 │ DNM1P17 │ dynamin 1 pseudoge… │ … │ NULL │ NULL │
105+
│ HGNC:21135 │ DNM1P18 │ dynamin 1 pseudoge… │ … │ NULL │ NULL │
106+
│ HGNC:21136 │ DNM1P19 │ dynamin 1 pseudoge… │ … │ NULL │ NULL │
107+
│ HGNC:35171 │ DNM1P24 │ dynamin 1 pseudoge… │ … │ NULL │ NULL │
108+
│ HGNC:35172 │ DNM1P25 │ dynamin 1 pseudoge… │ … │ NULL │ NULL │
109+
│ HGNC:35173 │ DNM1P26 │ dynamin 1 pseudoge… │ … │ NULL │ NULL │
110+
│ HGNC:35174 │ DNM1P27 │ dynamin 1 pseudoge… │ … │ NULL │ NULL │
111+
├────────────┴─────────┴──────────────────────┴───┴──────────────────────┴───────────┤
112+
│ 10 rows 54 columns (5 shown) │
113+
└────────────────────────────────────────────────────────────────────────────────────┘
114+
```
115+
116+
With the above you should see data landing in the repository's `./data` directory, under both `minio/` and `pgdata/`.
117+
118+
You should already be dropped straight into a configured DuckLake using MinIO and Postgres against your local disk. But if you want to attach from the
119+
outside rather than from the container helper you will need to do the following in your duckdb client/.duckdbrc:
120+
121+
```sql
122+
INSTALL ducklake;
123+
INSTALL postgres;
124+
125+
SET s3_url_style = 'path';
126+
SET s3_endpoint = 'minio:9000';
127+
SET s3_access_key_id = 'minioadmin';
128+
SET s3_secret_access_key = 'minioadmin';
129+
SET s3_region = 'us-east-1';
130+
SET s3_use_ssl = false;
131+
132+
-- Auto-attach your lakehouse
133+
ATTACH 'ducklake:postgres:dbname=ducklake_catalog host=postgres user=ducklake password=ducklake'
134+
AS the_ducklake (DATA_PATH 's3://ducklake/lake/');
135+
```
136+
90137
## Configuration
91138

92139
All credentials and endpoints are controlled via environment variables in `docker-compose.yml`:

docker-compose.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ services:
77
POSTGRES_PASSWORD: ducklake
88
POSTGRES_DB: ducklake_catalog
99
volumes:
10-
- pgdata:/var/lib/postgresql/data
10+
- ./data/pgdata:/var/lib/postgresql/data
1111
healthcheck:
1212
test: ["CMD-SHELL", "pg_isready -U ducklake"]
1313
interval: 5s
@@ -24,7 +24,7 @@ services:
2424
ports:
2525
- "9000:9000"
2626
volumes:
27-
- miniodata:/data
27+
- ./data/minio:/data
2828
healthcheck:
2929
test: ["CMD", "curl", "-I", "http://localhost:9000/minio/health/live"]
3030
interval: 5s

ducklake-init/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@ RUN apt-get update \
1111

1212
# Install the latest duckdb CLI client
1313
RUN curl https://install.duckdb.org | sh
14+
ENV PATH="/root/.duckdb/cli/latest:$PATH"
15+
CMD duckdb --version
16+
17+
# Copy DuckDB CLI init file so every duckdb session picks up MinIO settings
18+
COPY duckdbrc.tpl /duckdbrc.tpl
1419

1520
COPY entrypoint.py /entrypoint.py
1621
RUN chmod +x /entrypoint.py

ducklake-init/duckdbrc.tpl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
-- ducklake-init/duckdbrc.tpl
2+
3+
-- DuckDB + MinIO HTTPFS config
4+
SET s3_url_style = 'path';
5+
SET s3_endpoint = 'minio:9000';
6+
SET s3_access_key_id = '$AWS_ACCESS_KEY_ID';
7+
SET s3_secret_access_key = '$AWS_SECRET_ACCESS_KEY';
8+
SET s3_region = '$AWS_REGION';
9+
SET s3_use_ssl = false;
10+
11+
-- Auto-attach your lakehouse
12+
ATTACH 'ducklake:postgres:dbname=$POSTGRES_DB host=postgres user=$POSTGRES_USER password=$POSTGRES_PASSWORD'
13+
AS the_ducklake (DATA_PATH 's3://$BUCKET/lake/');
14+
USE the_ducklake;

ducklake-init/entrypoint.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# ///
1212

1313
import os
14+
from string import Template
1415
import time
1516

1617
import duckdb
@@ -20,6 +21,22 @@
2021
import psycopg
2122
import requests
2223

24+
25+
def dump_duckdbrc(tpl_path="/duckdbrc.tpl", out_path="/root/.duckdbrc"):
    """Render the DuckDB rc template and write it to the user's rc file.

    Reads the ``string.Template`` source at *tpl_path*, substitutes the
    MinIO/S3 credentials and Postgres catalog settings from environment
    variables, and writes the result to *out_path* so every DuckDB session
    (CLI or Python API) picks up the HTTPFS config and auto-ATTACH.

    Parameters default to the container paths used by the init image, so
    existing zero-argument callers are unaffected.

    Raises:
        KeyError: if a required environment variable is missing, or the
            template references a placeholder not supplied below
            (``Template.substitute`` is strict by design — fail fast
            rather than write a half-rendered rc file).
        FileNotFoundError: if *tpl_path* does not exist.
    """
    # Context managers close both file handles even on error (the original
    # left the read handle to the garbage collector).
    with open(tpl_path) as f:
        tpl = Template(f.read())
    content = tpl.substitute(
        AWS_ACCESS_KEY_ID=os.environ["AWS_ACCESS_KEY_ID"],
        AWS_SECRET_ACCESS_KEY=os.environ["AWS_SECRET_ACCESS_KEY"],
        # Region is optional; MinIO accepts any value, so default sensibly.
        AWS_REGION=os.environ.get("AWS_REGION", "us-east-1"),
        POSTGRES_DB=os.environ["POSTGRES_DB"],
        POSTGRES_USER=os.environ["POSTGRES_USER"],
        POSTGRES_PASSWORD=os.environ["POSTGRES_PASSWORD"],
        BUCKET=os.environ["BUCKET"],
    )
    with open(out_path, "w") as f:
        f.write(content)
38+
39+
2340
def wait_for_postgres(host="postgres", user=None, password=None, db=None, timeout=60):
2441
deadline = time.time() + timeout
2542
while time.time() < deadline:
@@ -56,7 +73,11 @@ def main():
5673
aws_ep = os.environ["AWS_ENDPOINT_URL"]
5774
bucket = os.environ["BUCKET"]
5875

59-
# Wait for dependencies
76+
# 1) Render ~/.duckdbrc so both CLI and Python API sessions will
77+
# pick up the HTTPFS settings + auto-ATTACH.
78+
dump_duckdbrc()
79+
80+
# 2) Wait on Postgres & MinIO, ensure bucket, then bootstrap/attach via Python API…
6081
wait_for_postgres(user=pg_user, password=pg_pass, db=pg_db)
6182
wait_for_minio(aws_ep)
6283

@@ -76,17 +97,31 @@ def main():
7697

7798
# Initialize or attach DuckLake
7899
con = duckdb.connect()
100+
101+
# 1) install extensions
79102
con.execute("INSTALL ducklake;")
80103
con.execute("INSTALL postgres;")
81104

105+
# 2) configure HTTPFS for MinIO
106+
for key, val in {
107+
"s3_url_style": "path",
108+
"s3_endpoint": "minio:9000",
109+
"s3_access_key_id": os.environ["AWS_ACCESS_KEY_ID"],
110+
"s3_secret_access_key": os.environ["AWS_SECRET_ACCESS_KEY"],
111+
"s3_region": os.environ.get("AWS_REGION", "us-east-1"),
112+
"s3_use_ssl": "false",
113+
}.items():
114+
con.execute(f"SET {key}='{val}';")
115+
116+
# 3) now attach/initialize DuckLake
82117
attach_sql = f"""
83118
ATTACH 'ducklake:postgres:dbname={pg_db} host=postgres user={pg_user} password={pg_pass}'
84119
AS the_ducklake (DATA_PATH 's3://{bucket}/lake/');
85120
"""
86121
con.execute(attach_sql)
87122
con.execute("USE the_ducklake;")
88123

89-
# Keep the container alive
124+
# 4) Keep the container alive
90125
print("DuckLake init complete; container is now running.")
91126
while True:
92127
time.sleep(3600)

0 commit comments

Comments
 (0)