
Commit fd7cef7

Merge pull request #1 from knowit-solutions-cocreate/add-datastack-commpose

Add datastack commpose

2 parents (5407382 + ced91c4), commit fd7cef7

Showing 21 changed files with 4,925 additions and 0 deletions.

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@
 # Folders
 _obj
 _test
+test-dbt-project

 # IntelliJ
 .idea

nessie-stack/docker-compose.yml

Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
version: '3.8'

services:
  minio:
    image: quay.io/minio/minio:latest
    container_name: minio
    command: server /data --console-address ":9001"
    environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    ports:
      - "9000:9000"
      - "9001:9001"
    volumes:
      - minio_data:/data
    networks:
      - common_network

  minio-client:
    image: minio/mc
    depends_on:
      - minio
    entrypoint: >
      /bin/sh -c "
      sleep 5;
      mc alias set local http://minio:9000 minioadmin minioadmin;
      mc mb local/warehouse;
      exit 0;
      "
    networks:
      - common_network

  nessie:
    image: projectnessie/nessie:latest
    container_name: nessie
    ports:
      - "19120:19120"
    environment:
      QUARKUS_HTTP_PORT: 19120
    networks:
      - common_network

  spark-iceberg:
    container_name: spark-iceberg
    build: spark/
    depends_on:
      - nessie
      - minio
    volumes:
      - ./warehouse:/home/iceberg/warehouse
      - ./notebooks:/home/iceberg/notebooks/notebooks
    environment:
      - AWS_ACCESS_KEY_ID=minioadmin
      - AWS_SECRET_ACCESS_KEY=minioadmin
      - AWS_REGION=us-east-1
    ports:
      - 8888:8888
      - 8080:8080
      - 10000:10000
      - 10001:10001
    networks:
      - common_network

volumes:
  minio_data:

networks:
  common_network:
    driver: bridge
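
To sanity-check the stack after bringing it up (e.g. `docker compose up -d` from nessie-stack/), one can probe the two service endpoints from the host. The sketch below is not part of the commit: it assumes the host-port mappings above and uses MinIO's documented liveness endpoint plus Nessie's v1 config endpoint.

# Minimal sketch (not part of this commit): probe MinIO and Nessie after
# `docker compose up -d`, assuming the host-port mappings defined above.
import json
import urllib.request

def probe(name: str, url: str) -> bytes | None:
    """Hit an HTTP endpoint and report whether the service answers."""
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            print(f"{name}: HTTP {resp.status}")
            return resp.read()
    except Exception as exc:
        print(f"{name}: not reachable ({exc})")
        return None

# MinIO liveness check (documented /minio/health/live endpoint).
probe("minio", "http://localhost:9000/minio/health/live")

# Nessie config endpoint; the response should name the default branch, "main".
body = probe("nessie", "http://localhost:19120/api/v1/config")
if body:
    print(json.dumps(json.loads(body), indent=2))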

nessie-stack/notebooks/.gitignore

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+metastore*

nessie-stack/notebooks/.ipynb_checkpoints/test-checkpoint.ipynb

Lines changed: 210 additions & 0 deletions
Large diffs are not rendered by default.

nessie-stack/notebooks/test.ipynb

Lines changed: 200 additions & 0 deletions

New Jupyter notebook (Python 3 ipykernel, Python 3.10.16, nbformat 4.4) with five executed code cells and one empty trailing cell:

In [2]:
from pyspark.sql import SparkSession

ICEBERG_VERSION = "1.8.1"
NESSIE_VERSION = "0.103.0"
SPARK_VERSION = "3.5"

spark = (
    SparkSession.builder.appName("IcebergNessieExample")
    # Use JARs from local Spark installation
#   .config("spark.driver.extraClassPath", "/opt/spark/jars/*")
#   .config("spark.executor.extraClassPath", "/opt/spark/jars/*")
    # Use correct Iceberg & Nessie JARs for Spark 3.5
    # .config("spark.jars.packages", f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_VERSION}_2.12:{ICEBERG_VERSION},"
    #                                f"org.projectnessie.nessie-integrations:nessie-spark-extensions-{SPARK_VERSION}_2.12:{NESSIE_VERSION}")
    # .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    # .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    # .config("spark.sql.catalog.nessie.uri", "http://nessie:19120/api/v1")
    # .config("spark.sql.catalog.nessie.ref", "main")
    .config("spark.sql.catalog.nessie.warehouse", "s3a://warehouse/")
    ### .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
#   .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
##  .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
    # .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

In [3]:
spark.sparkContext.getConf().getAll()

Out[3]:
[('spark.hadoop.fs.s3a.path.style.access', 'true'),
 ('spark.driver.port', '35325'),
 ('spark.sql.warehouse.dir',
  'file:/home/iceberg/notebooks/notebooks/spark-warehouse'),
 ('spark.app.submitTime', '1742217572814'),
 ('spark.sql.catalog.iceberg.s3.path-style-access', 'true'),
 ('spark.sql.catalog.iceberg.s3.endpoint', 'http://minio:9000'),
 ('spark.sql.catalog.iceberg.type', 'nessie'),
 ('spark.app.id', 'local-1742217573458'),
 ('spark.hadoop.fs.s3a.access.key', 'minioadmin'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.driver.host', 'ff5452dce47b'),
 ('spark.submit.deployMode', 'client'),
 ('spark.sql.extensions',
  'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions'),
 ('spark.hadoop.fs.s3a.secret.key', 'minioadmin'),
 ('spark.driver.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),
 ('spark.sql.catalog.iceberg.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO'),
 ('spark.executor.id', 'driver'),
 ('spark.sql.catalog.iceberg.ref', 'main'),
 ('spark.app.name', 'PySparkShell'),
 ('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
 ('spark.sql.catalog.iceberg', 'org.apache.iceberg.spark.SparkCatalog'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.rdd.compress', 'True'),
 ('spark.executor.extraJavaOptions',
  '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'),
 ('spark.app.startTime', '1742217573143'),
 ('spark.submit.pyFiles', ''),
 ('spark.hadoop.fs.s3a.endpoint', 'http://minio:9000'),
 ('spark.sql.catalog.iceberg.warehouse', 's3a://warehouse'),
 ('spark.sql.catalog.iceberg.uri', 'http://nessie:19120/api/v1'),
 ('spark.ui.showConsoleProgress', 'true')]

In [4]:
spark.sql('use iceberg')

(stderr)
25/03/17 13:19:42 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/03/17 13:19:42 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/03/17 13:19:43 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/03/17 13:19:43 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore [email protected]
25/03/17 13:19:43 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException

Out[4]:
DataFrame[]

In [5]:
spark.sql('create namespace default')

Out[5]:
DataFrame[]

In [6]:
spark.sql("CREATE TABLE IF NOT EXISTS default.employees (id INT, name STRING, salary DOUBLE) USING iceberg")

spark.sql("INSERT INTO default.employees VALUES (1, 'Alice', 75000), (2, 'Bob', 80000)")

df = spark.sql("SELECT * FROM default.employees")
df.show()

+---+-----+-------+
| id| name| salary|
+---+-----+-------+
|  1|Alice|75000.0|
|  2|  Bob|80000.0|
+---+-----+-------+
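
Most of the catalog wiring in the first cell is commented out because, as the `getConf()` output shows, the image's Spark defaults already configure an `iceberg` catalog. For reference, an explicit session that wires Nessie, Iceberg, and MinIO itself might look like the sketch below; it is assembled from the config keys visible above (the commented-out builder calls and the `getConf()` dump), not taken verbatim from the repo.

# Sketch only: an explicit SparkSession roughly equivalent to the image's
# built-in defaults. All keys and versions come from the notebook above.
from pyspark.sql import SparkSession

ICEBERG_VERSION = "1.8.1"
NESSIE_VERSION = "0.103.0"
SPARK_VERSION = "3.5"

spark = (
    SparkSession.builder.appName("IcebergNessieExample")
    # Pull the Iceberg Spark runtime and the Nessie Spark SQL extensions.
    .config(
        "spark.jars.packages",
        f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_VERSION}_2.12:{ICEBERG_VERSION},"
        f"org.projectnessie.nessie-integrations:nessie-spark-extensions-{SPARK_VERSION}_2.12:{NESSIE_VERSION}",
    )
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,"
        "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
    )
    # A Nessie-backed Iceberg catalog named "nessie".
    .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .config("spark.sql.catalog.nessie.uri", "http://nessie:19120/api/v1")
    .config("spark.sql.catalog.nessie.ref", "main")
    .config("spark.sql.catalog.nessie.warehouse", "s3a://warehouse/")
    # S3A pointed at MinIO; credentials match the compose file.
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)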

nessie-stack/spark/.pyiceberg.yaml

Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
catalog:
  default:
    uri: http://rest:8181
    s3.endpoint: http://minio:9000
    s3.access-key-id: admin
    s3.secret-access-key: password
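
This `default` catalog points at a REST catalog on http://rest:8181, a service not defined in the compose file shown above (it may live in one of the other changed files). Assuming such a REST catalog is reachable, loading it from Python is short; the sketch below uses PyIceberg's standard `load_catalog` entry point, and `default.employees` is reused from the notebook purely for illustration.

# Sketch only: read a table through the "default" catalog declared in
# .pyiceberg.yaml. Assumes a REST catalog is actually serving http://rest:8181;
# "default.employees" is borrowed from the notebook above for illustration.
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")  # resolved from .pyiceberg.yaml
table = catalog.load_table("default.employees")
print(table.schema())
for record in table.scan(limit=5).to_arrow().to_pylist():
    print(record)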

0 commit comments