diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6f266ae7..6912ad87 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -39,7 +39,7 @@ jobs: tag: credit-card-views test_commands: | compile -c creditcard_views_package_test.json - # FIXME test -c creditcard_views_package_test.json + test -c creditcard_views_package_test.json - example: healthcare-study-analytics path: healthcare-study/healthcare-study-analytics @@ -103,7 +103,7 @@ jobs: env: TZ: 'America/Los_Angeles' - SQRL_VERSION: 'dev' + SQRL_VERSION: 'latest' steps: - uses: actions/checkout@v4 diff --git a/finance-credit-card-chatbot/credit-card-views/creditcard_views.sqrl b/finance-credit-card-chatbot/credit-card-views/creditcard_views.sqrl index 016d1882..2817f156 100644 --- a/finance-credit-card-chatbot/credit-card-views/creditcard_views.sqrl +++ b/finance-credit-card-chatbot/credit-card-views/creditcard_views.sqrl @@ -33,5 +33,8 @@ CustomerTransactionWithMerchant := /* =======TEST CASES======== */ /*+test */ -SpendingByCategoryTest := SELECT * FROM SpendingByCategory ORDER BY customerId DESC, timeWeek DESC limit 5; +SpendingByCategoryTest := SELECT * + FROM SpendingByCategory + ORDER BY customerId DESC, timeWeek DESC, spending DESC + LIMIT 5; diff --git a/finance-credit-card-chatbot/credit-card-views/creditcard_views_package_snowflake.json b/finance-credit-card-chatbot/credit-card-views/creditcard_views_package_snowflake.json index 95d42ab7..4bacd1b2 100644 --- a/finance-credit-card-chatbot/credit-card-views/creditcard_views_package_snowflake.json +++ b/finance-credit-card-chatbot/credit-card-views/creditcard_views_package_snowflake.json @@ -13,7 +13,7 @@ "snowflake" : { "catalog-name": "MyCatalog", "external-volume": "MyNewVolume", - "url": "jdbc:snowflake://${SNOWFLAKE_ID}.snowflakecomputing.com/?user=${SNOWFLAKE_USER}&password=${SNOWFLAKE_PASSWORD}&warehouse=COMPUTE_WH&db=MYSNOWFLAKEDB&schema=public&disableSslHostnameVerification=true" + "url": 
"${SNOWFLAKE_JDBC_URL}" } }, "connectors" : { @@ -21,7 +21,7 @@ "warehouse":"s3://my-iceberg-table-test", "catalog-impl":"org.apache.iceberg.aws.glue.GlueCatalog", "io-impl":"org.apache.iceberg.aws.s3.S3FileIO", - "catalog-name": "mydatabase", + "catalog-name": "mycatalog", "catalog-database": "mydatabase" } }, diff --git a/finance-credit-card-chatbot/credit-card-views/snapshots/SpendingByCategoryTest.snapshot b/finance-credit-card-chatbot/credit-card-views/snapshots/SpendingByCategoryTest.snapshot index 19c4ce94..d8b5b010 100644 --- a/finance-credit-card-chatbot/credit-card-views/snapshots/SpendingByCategoryTest.snapshot +++ b/finance-credit-card-chatbot/credit-card-views/snapshots/SpendingByCategoryTest.snapshot @@ -3,28 +3,28 @@ "SpendingByCategoryTest" : [ { "customerId" : 10, "timeWeek" : "2024-05-30T13:59:59.999Z", - "category" : "Entertainment", - "spending" : 263.24 + "category" : "Housing & Utilities", + "spending" : 4767.13 }, { "customerId" : 10, "timeWeek" : "2024-05-30T13:59:59.999Z", - "category" : "Groceries", - "spending" : 1529.74 + "category" : "Miscellaneous", + "spending" : 2431.46 }, { "customerId" : 10, "timeWeek" : "2024-05-30T13:59:59.999Z", - "category" : "Communication", - "spending" : 198.98 + "category" : "Travel & Vacations", + "spending" : 2088.6 }, { "customerId" : 10, "timeWeek" : "2024-05-30T13:59:59.999Z", - "category" : "Clothing & Apparel", - "spending" : 976.09 + "category" : "Groceries", + "spending" : 1529.74 }, { "customerId" : 10, "timeWeek" : "2024-05-30T13:59:59.999Z", - "category" : "Health & Wellness", - "spending" : 493.13 + "category" : "Restaurants & Dining", + "spending" : 1478.88 } ] } } \ No newline at end of file diff --git a/getting-started-examples/01_kafka_to_console/README.md b/getting-started-examples/01_kafka_to_console/README.md index 651a15a5..18710aa2 100644 --- a/getting-started-examples/01_kafka_to_console/README.md +++ b/getting-started-examples/01_kafka_to_console/README.md @@ -1,8 +1,8 @@ -# 
Kafka-to-Kafka with Avro using DataSQRL +# Kafka to Console with Avro using DataSQRL This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a streaming pipeline that: -- Reads data from a kafka topic and prints output to console +- Reads data from a Kafka topic and prints output to console - Kafka is part of the DataSQRL package. ## 🐳 Running DataSQRL @@ -10,7 +10,7 @@ This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a Run the following command from the project root where your `package.json` and SQRL scripts reside: ```bash -docker run -it --rm -p 8888:8888 -p 9092:9092 -v $PWD:/build datasqrl/cmd:0.7.1 run -c package.json +docker run -it --rm -p 8888:8888 -p 9092:9092 -v $PWD:/build datasqrl/cmd:latest run -c package.json ``` ## Generate Data diff --git a/getting-started-examples/02_kafka_to_kafka/README.md b/getting-started-examples/02_kafka_to_kafka/README.md index d1c11a02..f4844aec 100644 --- a/getting-started-examples/02_kafka_to_kafka/README.md +++ b/getting-started-examples/02_kafka_to_kafka/README.md @@ -1,8 +1,8 @@ -# Kafka-to-Kafka with Avro using DataSQRL +# Kafka to Kafka with Avro using DataSQRL This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a streaming pipeline that: -- Reads data from a kafka topic and writes to another kafka topic +- Reads data from a Kafka topic and writes to another Kafka topic - Kafka is part of the DataSQRL package. 
## 🐳 Running DataSQRL @@ -10,7 +10,7 @@ This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a Run the following command from the project root where your `package.json` and SQRL scripts reside: ```bash -docker run -it --rm -p 8888:8888 -p 9092:9092 -v $PWD:/build datasqrl/cmd:0.7.1 run -c package.json +docker run -it --rm -p 8888:8888 -p 9092:9092 -v $PWD:/build datasqrl/cmd:latest run -c package.json ``` ## Generate Data diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/README.md b/getting-started-examples/03_kafka_join/README.md similarity index 85% rename from getting-started-examples/03_two_streams_kafka_to_kafka/README.md rename to getting-started-examples/03_kafka_join/README.md index 59e3886f..e7635b1c 100644 --- a/getting-started-examples/03_two_streams_kafka_to_kafka/README.md +++ b/getting-started-examples/03_kafka_join/README.md @@ -1,8 +1,8 @@ -# Kafka-to-Kafka with Avro using DataSQRL +# Kafka Join using DataSQRL This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a streaming pipeline that: -- Reads data from two kafka topics and combines the data from two streams using temporal join +- Reads data from two Kafka topics and combines the data from two streams using temporal join - writes output to another kafka topic - Kafka is part of datasqrl package. 
@@ -13,7 +13,7 @@ This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a Run the following command from the project root where your `package.json` and SQRL scripts reside: ```bash -docker run -it --rm -p 8888:8888 -p 9092:9092 -v $PWD:/build datasqrl/cmd:0.7.1 run -c package.json +docker run -it --rm -p 8888:8888 -p 9092:9092 -v $PWD:/build datasqrl/cmd:latest run -c package.json ``` ## Generate Data diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/contact.jsonl b/getting-started-examples/03_kafka_join/data-generator/contact.jsonl similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/contact.jsonl rename to getting-started-examples/03_kafka_join/data-generator/contact.jsonl diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/contact.table.sql b/getting-started-examples/03_kafka_join/data-generator/contact.table.sql similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/contact.table.sql rename to getting-started-examples/03_kafka_join/data-generator/contact.table.sql diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/load_data.py b/getting-started-examples/03_kafka_join/data-generator/load_data.py similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/load_data.py rename to getting-started-examples/03_kafka_join/data-generator/load_data.py diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/organization.jsonl b/getting-started-examples/03_kafka_join/data-generator/organization.jsonl similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/organization.jsonl rename to getting-started-examples/03_kafka_join/data-generator/organization.jsonl diff --git 
a/getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/organization.table.sql b/getting-started-examples/03_kafka_join/data-generator/organization.table.sql similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/data-generator/organization.table.sql rename to getting-started-examples/03_kafka_join/data-generator/organization.table.sql diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/kafka-sink/enrichedcontact.table.sql b/getting-started-examples/03_kafka_join/kafka-sink/enrichedcontact.table.sql similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/kafka-sink/enrichedcontact.table.sql rename to getting-started-examples/03_kafka_join/kafka-sink/enrichedcontact.table.sql diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/kafka-source/Contact.table.sql b/getting-started-examples/03_kafka_join/kafka-source/Contact.table.sql similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/kafka-source/Contact.table.sql rename to getting-started-examples/03_kafka_join/kafka-source/Contact.table.sql diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/kafka-source/Organization.table.sql b/getting-started-examples/03_kafka_join/kafka-source/Organization.table.sql similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/kafka-source/Organization.table.sql rename to getting-started-examples/03_kafka_join/kafka-source/Organization.table.sql diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/kafka-test.sqrl b/getting-started-examples/03_kafka_join/kafka-test.sqrl similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/kafka-test.sqrl rename to getting-started-examples/03_kafka_join/kafka-test.sqrl diff --git a/getting-started-examples/03_two_streams_kafka_to_kafka/package.json b/getting-started-examples/03_kafka_join/package.json 
similarity index 100% rename from getting-started-examples/03_two_streams_kafka_to_kafka/package.json rename to getting-started-examples/03_kafka_join/package.json diff --git a/getting-started-examples/05_file_iceberg_test/README.md b/getting-started-examples/04_file_to_iceberg_test/README.md similarity index 73% rename from getting-started-examples/05_file_iceberg_test/README.md rename to getting-started-examples/04_file_to_iceberg_test/README.md index 0b0239fb..a275f03b 100644 --- a/getting-started-examples/05_file_iceberg_test/README.md +++ b/getting-started-examples/04_file_to_iceberg_test/README.md @@ -9,7 +9,7 @@ This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a Run the following command from the project root where your `package.json` and SQRL scripts reside: ```bash -docker run -it --rm -p 8888:8888 -p 8081:8081 -v $PWD:/build -v $PWD/data:/data datasqrl/cmd:0.7.1 run -c package.json +docker run -it --rm -p 8888:8888 -p 8081:8081 -v $PWD:/build datasqrl/cmd:latest run -c package.json ``` > [!NOTE] @@ -17,5 +17,5 @@ docker run -it --rm -p 8888:8888 -p 8081:8081 -v $PWD:/build -v $PWD/data:/data ## Output -* There should be iceberg files and folders generated in `$PWD/data/iceberg` directory +* There should be iceberg files and folders generated in `$PWD/warehouse` directory * Data for the output table will reside in `ProcessedData` (as defined in the sqrl script) diff --git a/getting-started-examples/05_file_iceberg_test/file-to-iceberg.sqrl b/getting-started-examples/04_file_to_iceberg_test/file-to-iceberg.sqrl similarity index 100% rename from getting-started-examples/05_file_iceberg_test/file-to-iceberg.sqrl rename to getting-started-examples/04_file_to_iceberg_test/file-to-iceberg.sqrl diff --git a/getting-started-examples/05_file_iceberg_test/package.json b/getting-started-examples/04_file_to_iceberg_test/package.json similarity index 91% rename from getting-started-examples/05_file_iceberg_test/package.json rename to 
getting-started-examples/04_file_to_iceberg_test/package.json index b95113b0..85ce8532 100644 --- a/getting-started-examples/05_file_iceberg_test/package.json +++ b/getting-started-examples/04_file_to_iceberg_test/package.json @@ -14,7 +14,7 @@ }, "connectors": { "iceberg": { - "warehouse": "/data/iceberg", + "warehouse": "warehouse", "catalog-type": "hadoop", "catalog-name": "mycatalog" } diff --git a/getting-started-examples/05_file_iceberg_test/testdata/MyFile.table.sql b/getting-started-examples/04_file_to_iceberg_test/testdata/MyFile.table.sql similarity index 100% rename from getting-started-examples/05_file_iceberg_test/testdata/MyFile.table.sql rename to getting-started-examples/04_file_to_iceberg_test/testdata/MyFile.table.sql diff --git a/getting-started-examples/05_file_iceberg_test/testdata/myfile.jsonl b/getting-started-examples/04_file_to_iceberg_test/testdata/myfile.jsonl similarity index 100% rename from getting-started-examples/05_file_iceberg_test/testdata/myfile.jsonl rename to getting-started-examples/04_file_to_iceberg_test/testdata/myfile.jsonl diff --git a/getting-started-examples/04_two_streams_external_kafka_to_kafka/README.md b/getting-started-examples/04_two_streams_external_kafka_to_kafka/README.md deleted file mode 100644 index 4c272b78..00000000 --- a/getting-started-examples/04_two_streams_external_kafka_to_kafka/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# Kafka-to-Kafka with Avro using DataSQRL - -This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a streaming pipeline that: - -- This example uses kafka that is running outside of docker on host machine -- Reads data from two kafka topics and combines the data from two streams using temporal join -- Writes output to another kafka topic running on host machine -- We are not using kafka running inside DataSQRL - -## Few things to note - -* `'properties.bootstrap.servers' = 'host.docker.internal:9092'` -> this tells docker to connect to your host machine's 
kafka -* You don't need to create output topic `enrichedcontact` -* `create-topics` array from `package.json` was removed since we are using external Kafka, where we expect source topics to be present -* We removed kafka engine from `enabled-engines` array in `package.json` - -## 🐳 Running DataSQRL - -Run the following command from the project root where your `package.json` and SQRL scripts reside: -```bash -docker run -it --rm -p 8888:8888 -p 8081:8081 -v $PWD:/build -v $PWD/data:/data datasqrl/cmd:0.7.1 run -c package.json -``` -> [!NOTE] -> We removed `-p 9092:9092` as we are using our own kafka running locally on host machine now - -## Generate Data - -* Go to `data-generator` folder - * `python3 load_data.py ` -* To send Contact data -```bash - python3 load_data.py contact.jsonl localhost:9092 contact -``` -* To send Organization data -```bash - python3 load_data.py organization.jsonl localhost:9092 organization -``` - -## Output - -* Updated records should be generated in `enrichedcontact` table. 
diff --git a/getting-started-examples/04_two_streams_external_kafka_to_kafka/docker-compose.yml b/getting-started-examples/04_two_streams_external_kafka_to_kafka/docker-compose.yml deleted file mode 100644 index 465052e8..00000000 --- a/getting-started-examples/04_two_streams_external_kafka_to_kafka/docker-compose.yml +++ /dev/null @@ -1,30 +0,0 @@ -version: '3.8' - -services: - zookeeper: - image: confluentinc/cp-zookeeper:7.5.0 - hostname: zookeeper - container_name: zookeeper - ports: - - "2181:2181" - environment: - ZOOKEEPER_CLIENT_PORT: 2181 - ZOOKEEPER_TICK_TIME: 2000 - - kafka: - image: confluentinc/cp-kafka:7.5.0 - hostname: kafka - container_name: kafka - depends_on: - - zookeeper - ports: - - "9092:9092" - environment: - KAFKA_BROKER_ID: 1 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:29092,PLAINTEXT_HOST://0.0.0.0:9092 - KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 - diff --git a/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-sink/EnrichedContact.table.sql b/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-sink/EnrichedContact.table.sql deleted file mode 100644 index 525f0216..00000000 --- a/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-sink/EnrichedContact.table.sql +++ /dev/null @@ -1,8 +0,0 @@ -CREATE TABLE EnrichedContact ( - WATERMARK FOR last_updated AS last_updated - INTERVAL '30' SECOND -) WITH ( - 'connector' = 'kafka', - 'topic' = 'enrichedcontact', - 'properties.bootstrap.servers' = 'host.docker.internal:9092', - 'format' = 'flexible-json' -); diff --git a/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-source/Contact.table.sql 
b/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-source/Contact.table.sql deleted file mode 100644 index 3475ff42..00000000 --- a/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-source/Contact.table.sql +++ /dev/null @@ -1,14 +0,0 @@ -CREATE TABLE Contact ( - id BIGINT, - firstname STRING, - lastname STRING, - last_updated TIMESTAMP_LTZ(3) METADATA FROM 'timestamp', - WATERMARK FOR last_updated AS last_updated - INTERVAL '30' SECOND -) WITH ( - 'connector' = 'kafka', - 'topic' = 'contact', - 'properties.bootstrap.servers' = 'host.docker.internal:9092', - 'properties.group.id' = 'group1_contacts', - 'scan.startup.mode' = 'earliest-offset', - 'format' = 'flexible-json' -); diff --git a/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-source/Organization.table.sql b/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-source/Organization.table.sql deleted file mode 100644 index be6de9c0..00000000 --- a/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-source/Organization.table.sql +++ /dev/null @@ -1,14 +0,0 @@ -CREATE TABLE Organization ( - userid BIGINT, - orgid BIGINT, - orgname STRING, - last_updated TIMESTAMP_LTZ(3) METADATA FROM 'timestamp', - WATERMARK FOR last_updated AS last_updated - INTERVAL '30' SECOND -) WITH ( - 'connector' = 'kafka', - 'topic' = 'organization', - 'properties.bootstrap.servers' = 'host.docker.internal:9092', - 'properties.group.id' = 'group1_organization', - 'scan.startup.mode' = 'earliest-offset', - 'format' = 'flexible-json' -); diff --git a/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-test.sqrl b/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-test.sqrl deleted file mode 100644 index 94496fe1..00000000 --- a/getting-started-examples/04_two_streams_external_kafka_to_kafka/kafka-test.sqrl +++ /dev/null @@ -1,15 +0,0 @@ -IMPORT kafka-source.Contact AS Contacts; -IMPORT 
kafka-source.Organization AS Organizations; - -/* - Joins Contacts and Organizations - Matches rows where the contact's id equals the organization's userid - But only if their last_updated timestamps are within Β±5 seconds of each other -*/ -EnrichedContacts := SELECT c.id, c.firstname, c.lastname, o.orgname, c.last_updated - FROM Contacts c - JOIN Organizations o - ON c.id = o.userid - AND c.last_updated BETWEEN o.last_updated - INTERVAL '30' SECOND AND o.last_updated + INTERVAL '30' SECOND; - -EXPORT EnrichedContacts TO kafka-sink.EnrichedContact; diff --git a/getting-started-examples/04_two_streams_external_kafka_to_kafka/package.json b/getting-started-examples/04_two_streams_external_kafka_to_kafka/package.json deleted file mode 100644 index 38e97cb4..00000000 --- a/getting-started-examples/04_two_streams_external_kafka_to_kafka/package.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "version": "1", - "enabled-engines": ["vertx", "postgres", "flink"], - "script": { - "main": "kafka-test.sqrl" - }, - "engines": { - "flink": { - "config": { - "table.exec.source.idle-timeout": "60 s" - } - } - } -} diff --git a/getting-started-examples/05_kafka_to_iceberg_local_test/README.md b/getting-started-examples/05_kafka_to_iceberg_local_test/README.md new file mode 100644 index 00000000..679e5926 --- /dev/null +++ b/getting-started-examples/05_kafka_to_iceberg_local_test/README.md @@ -0,0 +1,27 @@ +# Kafka to Local Iceberg Warehouse Using DataSQRL + +This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a streaming pipeline that: + +- Reads data from a Kafka topic +- Writes data to an Iceberg table locally + +## 🐳 Running DataSQRL + +Run the following command from the project root where your `package.json` and SQRL scripts reside: +```bash +docker run -it --rm -p 8888:8888 -p 8081:8081 -p 9092:9092 -v $PWD:/build datasqrl/cmd:latest run -c package.json +``` + +## Generate Data + +* Go to `data-generator` folder + * `python3 load_data.py ` +* To send 
Contact data +```bash +python3 load_data.py contacts.jsonl contacts +``` + +## Output + +* There should be iceberg files and folders generated in `$PWD/warehouse` directory +* Data for the output table will reside in `MyContacts` (as defined in the sqrl script) diff --git a/getting-started-examples/06_external_kafka_iceberg_test/data-generator/contacts.jsonl b/getting-started-examples/05_kafka_to_iceberg_local_test/data-generator/contacts.jsonl similarity index 100% rename from getting-started-examples/06_external_kafka_iceberg_test/data-generator/contacts.jsonl rename to getting-started-examples/05_kafka_to_iceberg_local_test/data-generator/contacts.jsonl diff --git a/getting-started-examples/06_external_kafka_iceberg_test/data-generator/contacts.table.sql b/getting-started-examples/05_kafka_to_iceberg_local_test/data-generator/contacts.table.sql similarity index 100% rename from getting-started-examples/06_external_kafka_iceberg_test/data-generator/contacts.table.sql rename to getting-started-examples/05_kafka_to_iceberg_local_test/data-generator/contacts.table.sql diff --git a/getting-started-examples/06_external_kafka_iceberg_test/data-generator/load_data.py b/getting-started-examples/05_kafka_to_iceberg_local_test/data-generator/load_data.py similarity index 100% rename from getting-started-examples/06_external_kafka_iceberg_test/data-generator/load_data.py rename to getting-started-examples/05_kafka_to_iceberg_local_test/data-generator/load_data.py diff --git a/getting-started-examples/07_external_kafka_iceberg_glue_test/kafka-source/Contact.table.sql b/getting-started-examples/05_kafka_to_iceberg_local_test/kafka-source/Contact.table.sql similarity index 69% rename from getting-started-examples/07_external_kafka_iceberg_glue_test/kafka-source/Contact.table.sql rename to getting-started-examples/05_kafka_to_iceberg_local_test/kafka-source/Contact.table.sql index 01dc223b..5dabb8d0 100644 --- 
a/getting-started-examples/07_external_kafka_iceberg_glue_test/kafka-source/Contact.table.sql +++ b/getting-started-examples/05_kafka_to_iceberg_local_test/kafka-source/Contact.table.sql @@ -6,9 +6,9 @@ CREATE TABLE Contact ( WATERMARK FOR last_updated AS last_updated - INTERVAL '1' SECOND ) WITH ( 'connector' = 'kafka', - 'topic' = 'contact', - 'properties.bootstrap.servers' = 'host.docker.internal:9092', - 'properties.group.id' = 'group1_contacts', + 'topic' = 'contacts', + 'properties.bootstrap.servers' = '${KAFKA_BOOTSTRAP_SERVERS}', + 'properties.group.id' = '${KAFKA_GROUP_ID}', 'scan.startup.mode' = 'earliest-offset', 'format' = 'flexible-json' ); diff --git a/getting-started-examples/05_kafka_to_iceberg_local_test/kafka-to-iceberg.sqrl b/getting-started-examples/05_kafka_to_iceberg_local_test/kafka-to-iceberg.sqrl new file mode 100644 index 00000000..a34a74cc --- /dev/null +++ b/getting-started-examples/05_kafka_to_iceberg_local_test/kafka-to-iceberg.sqrl @@ -0,0 +1,3 @@ +IMPORT kafka-source.Contact AS _Contacts; + +MyContacts := SELECT id, firstname, lastname, last_updated FROM _Contacts; diff --git a/getting-started-examples/06_external_kafka_iceberg_test/package.json b/getting-started-examples/05_kafka_to_iceberg_local_test/package.json similarity index 71% rename from getting-started-examples/06_external_kafka_iceberg_test/package.json rename to getting-started-examples/05_kafka_to_iceberg_local_test/package.json index 40dd2d1c..f757ad69 100644 --- a/getting-started-examples/06_external_kafka_iceberg_test/package.json +++ b/getting-started-examples/05_kafka_to_iceberg_local_test/package.json @@ -1,6 +1,6 @@ { "version": "1", - "enabled-engines": ["flink", "iceberg"], + "enabled-engines": ["flink", "kafka", "iceberg"], "script": { "main": "kafka-to-iceberg.sqrl" }, @@ -14,9 +14,12 @@ }, "connectors": { "iceberg": { - "warehouse": "/data/iceberg", + "warehouse": "warehouse", "catalog-type": "hadoop", "catalog-name": "mycatalog" } + }, + "test-runner": { + 
"create-topics": ["contacts"] } } diff --git a/getting-started-examples/06_external_kafka_iceberg_test/README.md b/getting-started-examples/06_external_kafka_iceberg_test/README.md deleted file mode 100644 index c70d086a..00000000 --- a/getting-started-examples/06_external_kafka_iceberg_test/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Kafka-to-Kafka with Avro using DataSQRL - -This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a streaming pipeline that: - -- This example uses Kafka that is running outside DataSQRL docker on host machine -- Reads data from kafka topic and writes to an iceberg table locally - -## 🐳 Running DataSQRL - -Run the following command from the project root where your `package.json` and SQRL scripts reside: -```bash -docker run -it --rm -p 8888:8888 -p 8081:8081 -v $PWD:/build -v $PWD/data:/data datasqrl/cmd:0.7.1 run -c package.json -``` -> [!NOTE] -> We removed `-p 9092:9092` as we are using our own kafka running locally on host machine now - -## Generate Data - -* Go to `data-generator` folder - * `python3 load_data.py ` -* To send Contact data -```bash -python3 load_data.py contacts.jsonl contact -``` - -## Output - -* Updated records should be generated in `enrichedcontact` topic. 
diff --git a/getting-started-examples/06_external_kafka_iceberg_test/iceberg-sink/EnrichedContact.table.sql b/getting-started-examples/06_external_kafka_iceberg_test/iceberg-sink/EnrichedContact.table.sql deleted file mode 100644 index e41ee852..00000000 --- a/getting-started-examples/06_external_kafka_iceberg_test/iceberg-sink/EnrichedContact.table.sql +++ /dev/null @@ -1,9 +0,0 @@ -CREATE TABLE EnrichedContact ( - WATERMARK FOR last_updated AS last_updated - INTERVAL '1' SECOND -) WITH ( - 'connector' = 'iceberg', - 'catalog-name' = 'mycatalog', - 'catalog-type' = 'hadoop', - 'warehouse' = '/data/iceberg', - 'format-version' = '2' -); diff --git a/getting-started-examples/06_external_kafka_iceberg_test/kafka-to-iceberg.sqrl b/getting-started-examples/06_external_kafka_iceberg_test/kafka-to-iceberg.sqrl deleted file mode 100644 index c479d146..00000000 --- a/getting-started-examples/06_external_kafka_iceberg_test/kafka-to-iceberg.sqrl +++ /dev/null @@ -1,8 +0,0 @@ -IMPORT kafka-source.Contact AS Contacts; - -EnrichedContacts := SELECT - id, - firstname, - lastname, - last_updated -FROM Contacts; diff --git a/getting-started-examples/06_kafka_to_iceberg_glue_test/README.md b/getting-started-examples/06_kafka_to_iceberg_glue_test/README.md new file mode 100644 index 00000000..8c0e809e --- /dev/null +++ b/getting-started-examples/06_kafka_to_iceberg_glue_test/README.md @@ -0,0 +1,53 @@ +# Kafka to Iceberg in S3 with AWS Glue Using DataSQRL + +This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a streaming pipeline that: + +- Reads data from a Kafka topic +- Writes data to S3 in Iceberg table format via AWS Glue + +## ☁️ AWS Prerequisites + +1. Create an AWS S3 bucket (e.g. `my-iceberg-table-test`) that you will use for the test ([docs](https://docs.aws.amazon.com/AmazonS3/latest/userguide/create-bucket-overview.html)) + 1. Note the region where you created the bucket +2. 
Create an AWS token for your user, that can be used for Flink/Iceberg to auth + 1. Sign in to AWS Console β†’ Go to IAM service + 2. Navigate to *Users* β†’ Find your username β†’ Click on it + 3. Security credentials tab β†’ Scroll to *Access keys* + 4. Click *Create access key* + 5. Choose use case (*Application running outside AWS*) β†’ Next + 6. Copy both values or download: `AWS_ACCESS_KEY_ID` (starts with AKIA...) `AWS_SECRET_ACCESS_KEY` (long random string) +3. Make sure that your user has S3 and Glue access +4. Create a Glue database + 1. Go to AWS Glue Service + 2. Left sidebar β†’ Click on "Data Catalog Tables" β†’ Click β€œDatabases” + 3. Click *Add database* + 4. Name it `mydatabase` (has to match with what is in the `package.json` config) + +## 🐳 Running DataSQRL + +Run the following command from the project root where your `package.json` and SQRL scripts reside: +```bash +docker run -it --rm \ + -p 8888:8888 \ + -p 8081:8081 \ + -p 9092:9092 \ + -v $PWD:/build \ + -e AWS_ACCESS_KEY_ID="" \ + -e AWS_SECRET_ACCESS_KEY=" ` +* To send Contact data +```bash +python3 load_data.py contacts.jsonl contacts +``` + +## Output + +* Records should show up in the S3 bucket under `s3://my-iceberg-table-test/mydatabase.db/mycontacts`, + and they are queryable via Amazon Athena. 
diff --git a/getting-started-examples/07_external_kafka_iceberg_glue_test/data-generator/contacts.jsonl b/getting-started-examples/06_kafka_to_iceberg_glue_test/data-generator/contacts.jsonl similarity index 100% rename from getting-started-examples/07_external_kafka_iceberg_glue_test/data-generator/contacts.jsonl rename to getting-started-examples/06_kafka_to_iceberg_glue_test/data-generator/contacts.jsonl diff --git a/getting-started-examples/07_external_kafka_iceberg_glue_test/data-generator/contacts.table.sql b/getting-started-examples/06_kafka_to_iceberg_glue_test/data-generator/contacts.table.sql similarity index 100% rename from getting-started-examples/07_external_kafka_iceberg_glue_test/data-generator/contacts.table.sql rename to getting-started-examples/06_kafka_to_iceberg_glue_test/data-generator/contacts.table.sql diff --git a/getting-started-examples/07_external_kafka_iceberg_glue_test/data-generator/load_data.py b/getting-started-examples/06_kafka_to_iceberg_glue_test/data-generator/load_data.py similarity index 100% rename from getting-started-examples/07_external_kafka_iceberg_glue_test/data-generator/load_data.py rename to getting-started-examples/06_kafka_to_iceberg_glue_test/data-generator/load_data.py diff --git a/getting-started-examples/06_external_kafka_iceberg_test/kafka-source/Contact.table.sql b/getting-started-examples/06_kafka_to_iceberg_glue_test/kafka-source/Contact.table.sql similarity index 69% rename from getting-started-examples/06_external_kafka_iceberg_test/kafka-source/Contact.table.sql rename to getting-started-examples/06_kafka_to_iceberg_glue_test/kafka-source/Contact.table.sql index 01dc223b..5dabb8d0 100644 --- a/getting-started-examples/06_external_kafka_iceberg_test/kafka-source/Contact.table.sql +++ b/getting-started-examples/06_kafka_to_iceberg_glue_test/kafka-source/Contact.table.sql @@ -6,9 +6,9 @@ CREATE TABLE Contact ( WATERMARK FOR last_updated AS last_updated - INTERVAL '1' SECOND ) WITH ( 'connector' = 'kafka', - 
'topic' = 'contact', - 'properties.bootstrap.servers' = 'host.docker.internal:9092', - 'properties.group.id' = 'group1_contacts', + 'topic' = 'contacts', + 'properties.bootstrap.servers' = '${KAFKA_BOOTSTRAP_SERVERS}', + 'properties.group.id' = '${KAFKA_GROUP_ID}', 'scan.startup.mode' = 'earliest-offset', 'format' = 'flexible-json' ); diff --git a/getting-started-examples/06_kafka_to_iceberg_glue_test/kafka-to-iceberg.sqrl b/getting-started-examples/06_kafka_to_iceberg_glue_test/kafka-to-iceberg.sqrl new file mode 100644 index 00000000..a34a74cc --- /dev/null +++ b/getting-started-examples/06_kafka_to_iceberg_glue_test/kafka-to-iceberg.sqrl @@ -0,0 +1,3 @@ +IMPORT kafka-source.Contact AS _Contacts; + +MyContacts := SELECT id, firstname, lastname, last_updated FROM _Contacts; diff --git a/getting-started-examples/07_external_kafka_iceberg_glue_test/package.json b/getting-started-examples/06_kafka_to_iceberg_glue_test/package.json similarity index 67% rename from getting-started-examples/07_external_kafka_iceberg_glue_test/package.json rename to getting-started-examples/06_kafka_to_iceberg_glue_test/package.json index fef37ff0..37c58a5e 100644 --- a/getting-started-examples/07_external_kafka_iceberg_glue_test/package.json +++ b/getting-started-examples/06_kafka_to_iceberg_glue_test/package.json @@ -1,6 +1,6 @@ { "version": "1", - "enabled-engines": ["flink", "iceberg"], + "enabled-engines": ["flink", "iceberg", "kafka"], "script": { "main": "kafka-to-iceberg.sqrl" }, @@ -15,12 +15,15 @@ "connectors": { "iceberg": { "connector": "iceberg", - "warehouse": "${S3_WAREHOUSE_PATH}", + "warehouse": "s3://my-iceberg-table-test", "catalog-impl": "org.apache.iceberg.aws.glue.GlueCatalog", "io-impl": "org.apache.iceberg.aws.s3.S3FileIO", - "catalog-name": "NSCatalog", - "catalog-database": "nsdatabase", + "catalog-name": "mycatalog", + "catalog-database": "mydatabase", "write.upsert.enabled": "true" } + }, + "test-runner": { + "create-topics": ["contacts"] } } diff --git 
a/getting-started-examples/07_external_kafka_iceberg_glue_test/README.md b/getting-started-examples/07_external_kafka_iceberg_glue_test/README.md deleted file mode 100644 index 2ac3c2de..00000000 --- a/getting-started-examples/07_external_kafka_iceberg_glue_test/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Kafka-to-Kafka with Avro using DataSQRL - -This project demonstrates how to use [DataSQRL](https://datasqrl.com) to build a streaming pipeline that: - -- This example demonstrates how to use kafka and glue -- We read from kafka topic and write data to s3 in iceberg glue format - -> [!IMPORTANT] -> To run this program you would need access to AWS S3 -> You should have a credentials file in your ~/.aws directory - -## 🐳 Running DataSQRL - -Run the following command from the project root where your `package.json` and SQRL scripts reside: -```bash -docker run -it --rm \ - -p 8888:8888 \ - -p 8081:8081 \ - -v $PWD:/build \ - -v ~/.aws:/root/.aws \ - -e AWS_REGION=us-east-1 \ - -e S3_WAREHOUSE_PATH=s3:///path/to/warehouse/ \ - datasqrl/cmd:0.7.1 run -c package.json -``` -> [!NOTE] -> We removed `-p 9092:9092` as we are using our own Kafka running locally on host machine now -> You also need to set `s3:///path/to/warehouse/` - -## Generate Data - -* Go to `data-generator` folder - * `python3 load_data.py ` -* To send Contact data -```bash -python3 load_data.py contacts.jsonl contact -``` - -## Output - -* Updated records should be generated in `enrichedcontact` table. 
diff --git a/getting-started-examples/07_external_kafka_iceberg_glue_test/iceberg-sink/EnrichedContact.table.sql b/getting-started-examples/07_external_kafka_iceberg_glue_test/iceberg-sink/EnrichedContact.table.sql deleted file mode 100644 index 2b18e53e..00000000 --- a/getting-started-examples/07_external_kafka_iceberg_glue_test/iceberg-sink/EnrichedContact.table.sql +++ /dev/null @@ -1,10 +0,0 @@ -CREATE TABLE EnrichedContact ( - WATERMARK FOR last_updated AS last_updated - INTERVAL '1' SECOND -) WITH ( - 'connector' = 'iceberg', - 'catalog-name' = 'mycatalog', - 'catalog-type' = 'hadoop', - 'warehouse' = '${S3_WAREHOUSE_PATH}', - 'format-version' = '2' - 'aws.region' = 'us-east-1' -); diff --git a/getting-started-examples/07_external_kafka_iceberg_glue_test/kafka-to-iceberg.sqrl b/getting-started-examples/07_external_kafka_iceberg_glue_test/kafka-to-iceberg.sqrl deleted file mode 100644 index 6a833d9a..00000000 --- a/getting-started-examples/07_external_kafka_iceberg_glue_test/kafka-to-iceberg.sqrl +++ /dev/null @@ -1,8 +0,0 @@ -IMPORT kafka-source.Contact AS contacts; - -enrichedcontacts := SELECT - id, - firstname, - lastname, - last_updated -FROM contacts; diff --git a/getting-started-examples/08_schema_registry_kafka_to_kafka/README.md b/getting-started-examples/07_schema_registry_kafka_to_kafka/README.md similarity index 97% rename from getting-started-examples/08_schema_registry_kafka_to_kafka/README.md rename to getting-started-examples/07_schema_registry_kafka_to_kafka/README.md index 2233fb55..7fb48317 100644 --- a/getting-started-examples/08_schema_registry_kafka_to_kafka/README.md +++ b/getting-started-examples/07_schema_registry_kafka_to_kafka/README.md @@ -30,7 +30,7 @@ Run the following command from the project root where your `package.json` and SQ docker run -it --rm \ -p 8888:8888 \ -v $PWD:/build \ - datasqrl/cmd:0.7.1 run -c package.json + datasqrl/cmd:latest run -c package.json ``` ## Generate Data diff --git 
a/getting-started-examples/08_schema_registry_kafka_to_kafka/data-generator/contact.avsc b/getting-started-examples/07_schema_registry_kafka_to_kafka/data-generator/contact.avsc similarity index 100% rename from getting-started-examples/08_schema_registry_kafka_to_kafka/data-generator/contact.avsc rename to getting-started-examples/07_schema_registry_kafka_to_kafka/data-generator/contact.avsc diff --git a/getting-started-examples/08_schema_registry_kafka_to_kafka/data-generator/data.jsonl b/getting-started-examples/07_schema_registry_kafka_to_kafka/data-generator/data.jsonl similarity index 100% rename from getting-started-examples/08_schema_registry_kafka_to_kafka/data-generator/data.jsonl rename to getting-started-examples/07_schema_registry_kafka_to_kafka/data-generator/data.jsonl diff --git a/getting-started-examples/08_schema_registry_kafka_to_kafka/data-generator/send_kafka_avro_records.py b/getting-started-examples/07_schema_registry_kafka_to_kafka/data-generator/send_kafka_avro_records.py similarity index 100% rename from getting-started-examples/08_schema_registry_kafka_to_kafka/data-generator/send_kafka_avro_records.py rename to getting-started-examples/07_schema_registry_kafka_to_kafka/data-generator/send_kafka_avro_records.py diff --git a/getting-started-examples/08_schema_registry_kafka_to_kafka/kafka-sink/EnrichedContactAvro.table.sql b/getting-started-examples/07_schema_registry_kafka_to_kafka/kafka-sink/EnrichedContactAvro.table.sql similarity index 100% rename from getting-started-examples/08_schema_registry_kafka_to_kafka/kafka-sink/EnrichedContactAvro.table.sql rename to getting-started-examples/07_schema_registry_kafka_to_kafka/kafka-sink/EnrichedContactAvro.table.sql diff --git a/getting-started-examples/08_schema_registry_kafka_to_kafka/kafka-source/Contact.table.sql b/getting-started-examples/07_schema_registry_kafka_to_kafka/kafka-source/Contact.table.sql similarity index 100% rename from 
getting-started-examples/08_schema_registry_kafka_to_kafka/kafka-source/Contact.table.sql rename to getting-started-examples/07_schema_registry_kafka_to_kafka/kafka-source/Contact.table.sql diff --git a/getting-started-examples/08_schema_registry_kafka_to_kafka/kafka-to-kafka.sqrl b/getting-started-examples/07_schema_registry_kafka_to_kafka/kafka-to-kafka.sqrl similarity index 100% rename from getting-started-examples/08_schema_registry_kafka_to_kafka/kafka-to-kafka.sqrl rename to getting-started-examples/07_schema_registry_kafka_to_kafka/kafka-to-kafka.sqrl diff --git a/getting-started-examples/08_schema_registry_kafka_to_kafka/package.json b/getting-started-examples/07_schema_registry_kafka_to_kafka/package.json similarity index 100% rename from getting-started-examples/08_schema_registry_kafka_to_kafka/package.json rename to getting-started-examples/07_schema_registry_kafka_to_kafka/package.json diff --git a/getting-started-examples/README.md b/getting-started-examples/README.md index 87a490b4..6be35bcc 100644 --- a/getting-started-examples/README.md +++ b/getting-started-examples/README.md @@ -5,16 +5,15 @@ Each example demonstrates a real-world data pipeline pattern using technologies ## πŸ“ Example Index -| Folder | Description | -|----------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------| -| [**01\_kafka\_to\_console**](./01_kafka_to_console) | Reads from Kafka and outputs to console. Simple setup for basic Kafka ingestion testing. | -| [**02\_kafka\_to\_kafka**](./02_kafka_to_kafka) | Reads from one Kafka topic and writes to another. Useful for learning basic Kafka transformations. | -| [**03\_two\_streams\_kafka\_to\_kafka**](./03_two_streams_kafka_to_kafka) | Combines two Kafka topics, performs stream joins/enrichment, and writes to Kafka. 
| -| [**04\_two\_streams\_external\_kafka\_to\_kafka**](./04_two_streams_external_kafka_to_kafka) | Simulates a more decoupled version of multi-stream joins. Good for external integration scenarios. | -| [**05\_file\_iceberg\_test**](./05_file_iceberg_test) | Writes data to local file-based Iceberg tables. Great for learning Iceberg without cloud dependencies. | -| [**06\_external\_kafka\_iceberg\_test**](./06_external_kafka_iceberg_test) | Kafka to Iceberg using a local warehouse directory. Minimal config, good for staging/testing. | -| [**07\_external\_kafka\_iceberg\_glue\_test**](./07_external_kafka_iceberg_glue_test) | Kafka to Iceberg using AWS Glue as catalog and S3 as storage. | -| [**08\_schema\_registry\_kafka\_to\_kafka**](./08_schema_registry_kafka_to_kafka) | Kafka-to-Kafka pipeline using Confluent Schema Registry for Avro schema management. | +| Folder | Description | +|-----------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------| +| [**01\_kafka\_to\_console**](./01_kafka_to_console) | Reads from Kafka and outputs to console. Simple setup for basic Kafka ingestion testing. | +| [**02\_kafka\_to\_kafka**](./02_kafka_to_kafka) | Reads from one Kafka topic and writes to another. Useful for learning basic Kafka transformations. | +| [**03\_kafka\_join**](./03_kafka_join) | Combines two Kafka topics, performs stream joins/enrichment, and writes to Kafka. | +| [**04\_file\_to\_iceberg\_test**](./04_file_to_iceberg_test) | Writes data to local file-based Iceberg tables. Great for learning Iceberg without cloud dependencies. | +| [**05\_kafka\_to\_iceberg\_local\_test**](./05_kafka_to_iceberg_local_test) | Kafka to Iceberg using a local warehouse directory. Minimal config, good for staging/testing. 
| +| [**06\_kafka\_to\_iceberg\_glue\_test**](./06_kafka_to_iceberg_glue_test) | Kafka to Iceberg using AWS Glue as catalog and S3 as storage. | +| [**07\_schema\_registry\_kafka\_to\_kafka**](./07_schema_registry_kafka_to_kafka) | Kafka-to-Kafka pipeline using Confluent Schema Registry for Avro schema management. | Each folder includes: @@ -30,7 +29,7 @@ Each folder includes: * [Docker](https://docs.docker.com/get-docker/) installed * Optional: Kafka and Schema Registry (locally or in cloud) -* For Glue/S3 integration: AWS CLI credentials (`~/.aws`) mounted in Docker +* For Glue/S3 integration: AWS CLI access key ### Run Any Example @@ -39,11 +38,12 @@ docker run -it --rm \ -p 8888:8888 \ -p 8081:8081 \ -v $PWD:/build \ - datasqrl/cmd:0.7.1 run -c package.json + datasqrl/cmd:latest run -c package.json ``` #### Persistent Data -To preserve the internally persisted data after the Docker container stopped running, extend with `/data` mount: +To preserve the internally persisted data (for Postgres, Redpanda, and Flink) after the Docker container +stopped running, extend with `/data` mount: ```bash docker run -it --rm \ @@ -51,22 +51,22 @@ -p 8081:8081 \ -v $PWD:/build \ -v $PWD/data:/data \ - datasqrl/cmd:0.7.1 run -c package.json + datasqrl/cmd:latest run -c package.json ``` #### Mount External Services -If using AWS or external services, extend with environment mounts: +If using AWS, pass the necessary environment variables defined by the AWS SDK: ```bash docker run -it --rm \ -p 8888:8888 \ -p 8081:8081 \ -v $PWD:/build \ - -v ~/.aws:/root/.aws \ - -e AWS_REGION=us-east-1 \ - -e S3_WAREHOUSE_PATH=s3://your-bucket/path/ \ - datasqrl/cmd:0.7.1 run -c package.json + -e AWS_ACCESS_KEY_ID="" \ + -e AWS_SECRET_ACCESS_KEY=" [!IMPORTANT] +> Make sure you pass the `SNOWFLAKE_JDBC_URL` environment variable to the container, which should be set to the complete JDBC URL.
+> For example: `jdbc:snowflake://abc12345.eu-central-1.snowflakecomputing.com/?user=MYUSER&password=MYPASSWORD&warehouse=MYWH&db=MYDB&schema=MYSCHEMA&role=MYROLE` + +> [!IMPORTANT] +> You must set a proper AWS access key, and a valid S3 bucket as `warehouse` in [study_analytics_package_snowflake.json](study_analytics_package_snowflake.json), +> or create the `my-iceberg-warehouse` bucket in the given AWS account. diff --git a/healthcare-study/healthcare-study-analytics/study_analytics_package_snowflake.json b/healthcare-study/healthcare-study-analytics/study_analytics_package_snowflake.json index 7a5b7262..67eb8a31 100644 --- a/healthcare-study/healthcare-study-analytics/study_analytics_package_snowflake.json +++ b/healthcare-study/healthcare-study-analytics/study_analytics_package_snowflake.json @@ -6,22 +6,22 @@ }, "engines": { "flink": { - "config" : { - "table.exec.source.idle-timeout": "1 s" + "config": { + "table.exec.source.idle-timeout": "500 ms" } }, - "snowflake" : { + "snowflake": { "catalog-name": "MyCatalog", "external-volume": "MyNewVolume", - "url": "jdbc:snowflake://${SNOWFLAKE_ID}.snowflakecomputing.com/?user=${SNOWFLAKE_USER}&password=${SNOWFLAKE_PASSWORD}&warehouse=COMPUTE_WH&db=MYSNOWFLAKEDB&schema=public&disableSslHostnameVerification=true" + "url": "${SNOWFLAKE_JDBC_URL}" } }, - "connectors" : { - "iceberg" : { - "warehouse":"s3://my-iceberg-table-test", - "catalog-impl":"org.apache.iceberg.aws.glue.GlueCatalog", - "io-impl":"org.apache.iceberg.aws.s3.S3FileIO", - "catalog-name": "mydatabase", + "connectors": { + "iceberg": { + "warehouse": "s3://my-iceberg-warehouse", + "catalog-impl": "org.apache.iceberg.aws.glue.GlueCatalog", + "io-impl": "org.apache.iceberg.aws.s3.S3FileIO", + "catalog-name": "mycatalog", + "catalog-database": "mydatabase" } } diff --git a/healthcare-study/healthcare-study-analytics/study_analytics_package_test.json b/healthcare-study/healthcare-study-analytics/study_analytics_package_test.json index 076fdbe4..2d8d3f82 
100644 --- a/healthcare-study/healthcare-study-analytics/study_analytics_package_test.json +++ b/healthcare-study/healthcare-study-analytics/study_analytics_package_test.json @@ -6,15 +6,15 @@ }, "engines": { "flink": { - "config" : { - "table.exec.source.idle-timeout": "1 s" + "config": { + "table.exec.source.idle-timeout": "500 ms" } } }, - "connectors" : { - "iceberg" : { - "warehouse":"warehouse", - "catalog-type":"hadoop", + "connectors": { + "iceberg": { + "warehouse": "warehouse", + "catalog-type": "hadoop", "catalog-name": "mydatabase" } }, diff --git a/iot-sensor-metrics/README.md b/iot-sensor-metrics/README.md index 2ba8bf27..0ffd5a27 100644 --- a/iot-sensor-metrics/README.md +++ b/iot-sensor-metrics/README.md @@ -9,7 +9,7 @@ To run the API, execute ```bash docker run -it -p 8081:8081 -p 8888:8888 --rm -v $PWD:/build datasqrl/cmd:latest run -c sensors_package_api.json -``s` +``` To check that the GraphQL API is running properly, [open GraphiQL](http://localhost:8888/graphiql/) to access the API.