Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ data:
mainApplicationFile: local:///stackable/spark/jobs/spark-ingest-into-lakehouse.py
deps:
packages:
- org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1
- org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.0
- org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.2
s3connection:
reference: minio
Expand Down
57 changes: 56 additions & 1 deletion demos/data-lakehouse-iceberg-trino-spark/load-test-data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,62 @@ spec:
- name: load-test-data
image: "bitnami/minio:2024-debian-12"
# Please try to order the load jobs from small to large datasets
command: ["bash", "-c", "mc --insecure alias set minio http://minio:9000/ $(cat /minio-s3-credentials/accessKey) $(cat /minio-s3-credentials/secretKey) && cd /tmp && curl -O https://repo.stackable.tech/repository/misc/datasets/open-postcode-geo/open-postcode-geo.csv && mc cp open-postcode-geo.csv minio/staging/house-sales/postcode-geo-lookup/ && rm open-postcode-geo.csv && for year in {2005..2021}; do curl -O https://repo.stackable.tech/repository/misc/datasets/uk-house-sales/uk-house-sales-$year.csv && mc cp uk-house-sales-$year.csv minio/staging/house-sales/house-sales/ && rm uk-house-sales-$year.csv; done && curl -O https://repo.stackable.tech/repository/misc/earthquake-data/earthquakes_1950_to_2022.csv && mc cp earthquakes_1950_to_2022.csv minio/staging/earthquakes/earthquakes/ && rm earthquakes_1950_to_2022.csv && curl -O https://repo.stackable.tech/repository/misc/datasets/e-charging-stations/e-charging-stations-2022-08.csv && mc cp e-charging-stations-2022-08.csv minio/staging/smart-city/e-charging-stations/ && rm e-charging-stations-2022-08.csv && curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/taxi_zone_lookup.csv && mc cp taxi_zone_lookup.csv minio/staging/taxi/taxi-zone-lookup/ && rm taxi_zone_lookup.csv && curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/rate_code_lookup.csv && mc cp rate_code_lookup.csv minio/staging/taxi/rate-code-lookup/ && rm rate_code_lookup.csv && curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/payment_type_lookup.csv && mc cp payment_type_lookup.csv minio/staging/taxi/payment-type-lookup/ && rm payment_type_lookup.csv && for month in 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07 2021-08 2021-09 2021-10 2021-11 2021-12 2022-01 2022-02 2022-03 2022-04 2022-05 2022-06; do curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/green_tripdata_$month.parquet && mc cp green_tripdata_$month.parquet minio/staging/taxi/green-tripdata/ && rm green_tripdata_$month.parquet; 
done && for month in 2015-01 2015-02 2015-03 2015-04 2015-05 2015-06 2015-07 2015-08 2015-09 2015-10 2015-11 2015-12 2016-01 2016-02 2016-03 2016-04 2016-05 2016-06 2016-07 2016-08 2016-09 2016-10 2016-11 2016-12 2017-01 2017-02 2017-03 2017-04 2017-05 2017-06 2017-07 2017-08 2017-09 2017-10 2017-11 2017-12 2018-01 2018-02 2018-03 2018-04 2018-05 2018-06 2018-07 2018-08 2018-09 2018-10 2018-11 2018-12 2019-01 2019-02 2019-03 2019-04 2019-05 2019-06 2019-07 2019-08 2019-09 2019-10 2019-11 2019-12 2020-01 2020-02 2020-03 2020-04 2020-05 2020-06 2020-07 2020-08 2020-09 2020-10 2020-11 2020-12 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07 2021-08 2021-09 2021-10 2021-11 2021-12 2022-01 2022-02 2022-03 2022-04 2022-05 2022-06 2022-07 2022-08 2022-09 2022-10 2022-11 2022-12 2023-01 2023-02 2023-03 2023-04; do curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/yellow_tripdata_$month.parquet && mc cp yellow_tripdata_$month.parquet minio/staging/taxi/yellow-tripdata/ && rm yellow_tripdata_$month.parquet; done && for month in 2020-09 2020-10 2020-11 2020-12 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07 2021-08 2021-09 2021-10 2021-11 2021-12 2022-01 2022-02 2022-03 2022-04 2022-05 2022-06; do curl -O https://repo.stackable.tech/repository/misc/ny-taxi-data/fhvhv_tripdata_$month.parquet && mc cp fhvhv_tripdata_$month.parquet minio/staging/taxi/fhvhv-tripdata/ && rm fhvhv_tripdata_$month.parquet; done"]
command:
  # Runs the data-load script under bash with xtrace, errexit, nounset and
  # pipefail enabled (-x -e -u -o pipefail), so the first failing step aborts
  # the job instead of silently continuing.
  - bash
  - -xeuo
  - pipefail
  - -c
  - |
    # Download one file into the current directory, copy it to the given
    # MinIO destination and delete the local copy again, so that only one
    # dataset file is held on local disk at a time.
    #   $1: source URL (the local file name is its last path segment)
    #   $2: destination prefix understood by `mc cp`
    fetch_and_upload() {
      local url="$1"
      local dest="$2"
      local file="${url##*/}"

      # --fail: exit non-zero on HTTP errors instead of saving the error page
      #         and uploading it as if it were the dataset.
      # -sS:    no progress meter, but still print error messages.
      curl --fail -sSO "$url"
      mc cp "$file" "$dest"
      rm "$file"
    }

    repo=https://repo.stackable.tech/repository/misc

    # xtrace would print the expanded access and secret key into the pod log,
    # so it is switched off while the alias is configured.
    set +x
    mc --insecure alias set minio http://minio:9000/ "$(cat /minio-s3-credentials/accessKey)" "$(cat /minio-s3-credentials/secretKey)"
    set -x

    cd /tmp

    fetch_and_upload "$repo/datasets/open-postcode-geo/open-postcode-geo.csv" minio/staging/house-sales/postcode-geo-lookup/

    for year in {2005..2021}; do
      fetch_and_upload "$repo/datasets/uk-house-sales/uk-house-sales-$year.csv" minio/staging/house-sales/house-sales/
    done

    fetch_and_upload "$repo/earthquake-data/earthquakes_1950_to_2022.csv" minio/staging/earthquakes/earthquakes/

    fetch_and_upload "$repo/datasets/e-charging-stations/e-charging-stations-2022-08.csv" minio/staging/smart-city/e-charging-stations/

    fetch_and_upload "$repo/ny-taxi-data/taxi_zone_lookup.csv" minio/staging/taxi/taxi-zone-lookup/
    fetch_and_upload "$repo/ny-taxi-data/rate_code_lookup.csv" minio/staging/taxi/rate-code-lookup/
    fetch_and_upload "$repo/ny-taxi-data/payment_type_lookup.csv" minio/staging/taxi/payment-type-lookup/

    # Green taxi trips: 2021-01 through 2022-06.
    for month in 2021-{01..12} 2022-{01..06}; do
      fetch_and_upload "$repo/ny-taxi-data/green_tripdata_$month.parquet" minio/staging/taxi/green-tripdata/
    done

    # Yellow taxi trips: 2015-01 through 2023-04.
    for month in {2015..2022}-{01..12} 2023-{01..04}; do
      fetch_and_upload "$repo/ny-taxi-data/yellow_tripdata_$month.parquet" minio/staging/taxi/yellow-tripdata/
    done

    # High-volume for-hire vehicle trips: 2020-09 through 2022-06.
    for month in 2020-{09..12} 2021-{01..12} 2022-{01..06}; do
      fetch_and_upload "$repo/ny-taxi-data/fhvhv_tripdata_$month.parquet" minio/staging/taxi/fhvhv-tripdata/
    done
volumeMounts:
- name: minio-s3-credentials
mountPath: /minio-s3-credentials
Expand Down
10 changes: 8 additions & 2 deletions docs/modules/demos/pages/data-lakehouse-iceberg-trino-spark.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -539,8 +539,14 @@ On the left, select the database `Trino lakehouse`, the schema `house_sales`, an

[IMPORTANT]
====
The older screenshot below shows what the table preview would look like. Currently, there is an https://github.com/apache/superset/issues/25307[open issue] with previewing Trino tables using the Iceberg connector.
This doesn't affect the execution of the following SQL statement.
Currently, there is an https://github.com/apache/superset/issues/25307[open issue] with previewing Trino tables using the Iceberg connector.
Until it is fixed, you will see the following error:

```
trino error: TrinoUserError(type=USER_ERROR, name=COLUMN_NOT_FOUND, message="line 3:7: Column 'partition' cannot be resolved", query_id=20241114_162624_00095_6hvqh)
```

This does not affect the execution of the following SQL statement.
====

image::data-lakehouse-iceberg-trino-spark/superset_8.png[]
Expand Down