Skip to content

Commit 37aba02

Browse files
committed
add Glue RedShift crawler sample
1 parent 78098d0 commit 37aba02

File tree

3 files changed

+133
-0
lines changed

3 files changed

+133
-0
lines changed

glue-redshift-crawler/Makefile

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Makefile for the Glue / Redshift crawler sample.
#
# Credentials default to dummy values but may be overridden from the
# environment; the region is always forced to us-east-1.
export AWS_ACCESS_KEY_ID ?= test
export AWS_SECRET_ACCESS_KEY ?= test
export AWS_DEFAULT_REGION = us-east-1

# Help target: lists every line of this Makefile that carries a double-hash
# help marker. The recipe line filters itself out by excluding lines that
# contain the literal grep invocation. "grep -F" replaces the obsolescent
# "fgrep", which prints a warning on current GNU grep releases.
# (Comments in this file use a single hash so they stay out of the help text.)
usage: ## Show this help
	@grep -F -h "##" $(MAKEFILE_LIST) | grep -F -v "grep -F" | sed -e 's/\\$$//' | sed -e 's/##//'

# Installs the CLI tools only when they are not already on PATH, then pulls
# the Docker image used by the sample. "command -v" is the POSIX lookup and
# still prints the resolved path like "which" did.
install: ## Install dependencies
	@command -v localstack || pip install localstack
	@command -v awslocal || pip install awscli-local
	docker pull localstack/bigdata

run: install ## Prepare environment and run sample
	./run.sh

# CI entry point. $(MAKE) is used instead of a literal "make" so that the
# sub-makes inherit command-line flags and the same make binary.
test-ci:
	$(MAKE) install && $(MAKE) run

.PHONY: usage install run test-ci

glue-redshift-crawler/README.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# LocalStack Demo: Glue Crawler RedShift Integration (JDBC)
2+
3+
Simple demo application illustrating the use of AWS Glue Crawler to populate the Glue metadata store with the table schema of a RedShift database, accessed via a JDBC connection.
4+
5+
## Prerequisites
6+
7+
* LocalStack
8+
* Docker
9+
* `make`
10+
* [`awslocal`](https://github.com/localstack/awscli-local)
* `jq` (used by `run.sh` to parse the JSON output of the AWS CLI)
11+
12+
## Installing
13+
To install the dependencies:
14+
```
15+
make install
16+
```
17+
18+
## App Details
19+
This example shows how to use AWS Glue Crawler to populate the Glue metadata store with the table schema of RedShift database tables.
20+
21+
The following steps are executed when running the sample:
22+
- Create a RedShift cluster and database.
23+
- Create a Glue connection, specifying the JDBC connection properties for the RedShift database.
24+
- Create a Glue database to store the table metadata in.
25+
- Create a Crawler to populate the Glue database with the RedShift table metadata using the Glue connection.
26+
- Create a new table in the RedShift database.
27+
- Run the Crawler.
28+
- Check out the resulting table metadata.
29+
30+
## Running
31+
Make sure that LocalStack is started:
32+
```
33+
LOCALSTACK_API_KEY=... DEBUG=1 localstack start
34+
```
35+
36+
The following command executes the sample:
37+
38+
```
39+
make run
40+
```
41+
42+
## License
43+
44+
This sample code is available under the Apache 2.0 license.

glue-redshift-crawler/run.sh

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/bin/bash
#
# Glue Crawler / Redshift (JDBC) sample: creates a Redshift cluster, wires it
# to a Glue connection and crawler, creates a table, runs the crawler and
# prints the resulting Glue table metadata.
#
# Requires: awslocal, jq.

# -e          abort on the first failing command
# -u          treat references to unset variables as errors
# -o pipefail make "awslocal ... | jq ..." fail when awslocal fails; without
#             it only jq's exit status counts and a failed call silently
#             yields an empty value
set -euo pipefail
# Trace every command. Note that the trace includes the demo password below.
set -x

# Names of the resources created by this sample (never reassigned).
readonly REDSHIFT_CLUSTER_IDENTIFIER="redshiftcluster"
readonly REDSHIFT_SCHEMA_NAME="public"
readonly REDSHIFT_DATABASE_NAME="db1"
readonly REDSHIFT_TABLE_NAME="sales"
readonly REDSHIFT_USERNAME="crawlertestredshiftusername"
readonly REDSHIFT_PASSWORD="crawlertestredshiftpassword"
readonly GLUE_DATABASE_NAME="gluedb"
readonly GLUE_CONNECTION_NAME="glueconnection"
readonly GLUE_CRAWLER_NAME="gluecrawler"
13+
14+
# Tear-down function to cleanup on exit
15+
#######################################
# Best-effort removal of every resource this sample creates.
# Globals:   REDSHIFT_CLUSTER_IDENTIFIER, GLUE_DATABASE_NAME,
#            GLUE_CONNECTION_NAME, GLUE_CRAWLER_NAME (all read)
# Outputs:   progress messages (and any awslocal output) on stdout
# Returns:   always 0
#######################################
cleanup() {
  echo ""
  # Failures are deliberately ignored (stderr dropped, "|| true"): a resource
  # may simply not exist, and a failed deletion must not abort the script
  # under "set -e" nor skip the remaining deletions.
  echo "(Cleanup) Deleting Redshift cluster."
  awslocal redshift delete-cluster --cluster-identifier "$REDSHIFT_CLUSTER_IDENTIFIER" 2> /dev/null || true
  echo "(Cleanup) Deleting Glue database."
  awslocal glue delete-database --name "$GLUE_DATABASE_NAME" 2> /dev/null || true
  echo "(Cleanup) Deleting Glue connection."
  awslocal glue delete-connection --connection-name "$GLUE_CONNECTION_NAME" 2> /dev/null || true
  echo "(Cleanup) Deleting Glue crawler."
  awslocal glue delete-crawler --name "$GLUE_CRAWLER_NAME" 2> /dev/null || true
}

# Run the tear-down on every exit path, including failures under "set -e".
trap cleanup EXIT
27+
28+
#######################################
# Poll a command until a field of its JSON output equals an expected value.
#
# NOTE: this function shadows the bash builtin of the same name; use
# "builtin wait" if background jobs ever need to be awaited in this script.
# Shell options are inherited from the caller and are not changed here.
#
# Arguments:
#   $1 - command line to run on every poll; it is word-split on whitespace,
#        so its individual arguments must not contain spaces
#   $2 - jq filter selecting the field to inspect
#   $3 - value the field must reach
#   $4 - optional maximum number of seconds to wait; 0 (the default) waits
#        indefinitely, which is the behaviour for three-argument calls
# Outputs:   a progress line on stdout per poll; a timeout message on stderr
# Returns:   0 once the field matches, 1 if the timeout expires first
#######################################
wait () {
  local command=$1
  local field=$2
  local expected=$3
  local timeout=${4:-0}
  local interval=5
  local elapsed=0
  local current

  # $command is intentionally unquoted so the string splits into the command
  # and its arguments. The jq filter IS quoted: filters such as
  # ".Clusters[0].ClusterStatus" are valid glob patterns and would otherwise
  # be replaced by a matching file name in the current directory.
  # shellcheck disable=SC2086
  current=$($command | jq -r "$field")
  while [ "$current" != "$expected" ]; do
    if [ "$timeout" -gt 0 ] && [ "$elapsed" -ge "$timeout" ]; then
      echo "Timed out after ${elapsed}s waiting for state change. Current: $current / Expected: $expected" >&2
      return 1
    fi
    echo "Waiting for state change. Current: $current / Expected: $expected"
    sleep "$interval"
    elapsed=$((elapsed + interval))
    # shellcheck disable=SC2086
    current=$($command | jq -r "$field")
  done
}
40+
41+
# Start from a clean slate: remove anything left over from a previous run.
cleanup

# Create the Redshift cluster and block until it reports "available".
echo "Creating Redshift cluster..."
awslocal redshift create-cluster \
  --cluster-identifier "$REDSHIFT_CLUSTER_IDENTIFIER" \
  --db-name "$REDSHIFT_DATABASE_NAME" \
  --master-username "$REDSHIFT_USERNAME" \
  --master-user-password "$REDSHIFT_PASSWORD" \
  --node-type n1
wait "awslocal redshift describe-clusters --cluster-identifier $REDSHIFT_CLUSTER_IDENTIFIER" ".Clusters[0].ClusterStatus" "available"

# Build "<address>:<port>" of the cluster endpoint for the JDBC URL.
REDSHIFT_URL=$(awslocal redshift describe-clusters --cluster-identifier "$REDSHIFT_CLUSTER_IDENTIFIER" \
  | jq -r '(.Clusters[0].Endpoint.Address) + ":" + (.Clusters[0].Endpoint.Port|tostring)')
# An empty result or a missing address/port (jq renders a missing port as
# "null") would only surface later as an unusable JDBC URL; fail here instead.
case "$REDSHIFT_URL" in
  "" | :* | *:null)
    echo "Could not determine the Redshift endpoint (got: '$REDSHIFT_URL')." >&2
    exit 1
    ;;
esac

# Create the Glue database, connection, and crawler
echo "Creating Glue db, connection, and crawler..."
awslocal glue create-database \
  --database-input "{\"Name\": \"$GLUE_DATABASE_NAME\"}"
awslocal glue create-connection \
  --connection-input "{\"Name\":\"$GLUE_CONNECTION_NAME\", \"ConnectionType\": \"JDBC\", \"ConnectionProperties\": {\"USERNAME\": \"$REDSHIFT_USERNAME\", \"PASSWORD\": \"$REDSHIFT_PASSWORD\", \"JDBC_CONNECTION_URL\": \"jdbc:redshift://$REDSHIFT_URL/$REDSHIFT_DATABASE_NAME\"}}"
# The "%" in the crawler path is the schema segment of database/schema/table.
awslocal glue create-crawler \
  --name "$GLUE_CRAWLER_NAME" \
  --database-name "$GLUE_DATABASE_NAME" \
  --targets "{\"JdbcTargets\": [{\"ConnectionName\": \"$GLUE_CONNECTION_NAME\", \"Path\": \"$REDSHIFT_DATABASE_NAME/%/$REDSHIFT_TABLE_NAME\"}]}" \
  --role r1

# Create a table in the redshift DB
echo "Creating table in Redshift DB..."
REDSHIFT_STATEMENT_ID=$(awslocal redshift-data execute-statement \
  --cluster-identifier "$REDSHIFT_CLUSTER_IDENTIFIER" \
  --database "$REDSHIFT_DATABASE_NAME" \
  --sql "create table $REDSHIFT_TABLE_NAME(salesid integer not null, listid integer not null, sellerid integer not null, buyerid integer not null, eventid integer not null, dateid smallint not null, qtysold smallint not null, pricepaid decimal(8,2), commission decimal(8,2), saletime timestamp)" \
  | jq -r .Id)
# Without a statement id the status poll below could never reach FINISHED.
if [[ -z "$REDSHIFT_STATEMENT_ID" || "$REDSHIFT_STATEMENT_ID" == "null" ]]; then
  echo "Submitting the CREATE TABLE statement did not return a statement id." >&2
  exit 1
fi
wait "awslocal redshift-data describe-statement --id $REDSHIFT_STATEMENT_ID" ".Status" "FINISHED"

# Run the crawler
echo "Starting Crawler..."
awslocal glue start-crawler --name "$GLUE_CRAWLER_NAME"
wait "awslocal glue get-crawler --name $GLUE_CRAWLER_NAME" ".Crawler.State" "READY"

# The table is looked up in Glue under the name <database>_<schema>_<table>.
echo "Getting Glue table..."
awslocal glue get-table \
  --database-name "$GLUE_DATABASE_NAME" \
  --name "${REDSHIFT_DATABASE_NAME}_${REDSHIFT_SCHEMA_NAME}_${REDSHIFT_TABLE_NAME}"

echo "Done."

0 commit comments

Comments
 (0)