Skip to content

Commit 2f43231

Browse files
authored
fix broken Glue ETL jobs sample (#154)
1 parent 3a2c57d commit 2f43231

File tree

2 files changed

+22
-20
lines changed

2 files changed

+22
-20
lines changed

glue-etl-jobs/job.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,9 @@ def main():
6666
m_df = dfc.select(df_name)
6767
print("Writing to Postgres table:", df_name)
6868
glueContext.write_dynamic_frame.from_jdbc_conf(
69-
frame = m_df, catalog_connection = "c1",
69+
frame = m_df,
70+
# should be the same as CONNECTION_NAME in the bash script (run.sh)
71+
catalog_connection = "glue-etl-cluster1-connection",
7072
connection_options = {"dbtable": df_name, "database": "test"},
7173
redshift_tmp_dir = redshift_temp_dir)
7274

glue-etl-jobs/run.sh

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -3,45 +3,45 @@
33
BUCKET=glue-pyspark-test
44
JOB_NAME=test-job1
55
S3_URL=s3://$BUCKET/job.py
6+
CLUSTER_IDENTIFIER=glue-etl-cluster1
7+
CONNECTION_NAME=glue-etl-cluster1-connection
68

79
echo Putting PySpark script to test S3 bucket ...
8-
awslocal s3 mb s3://$BUCKET
9-
awslocal s3 cp job.py $S3_URL
10-
awslocal s3 mb s3://glue-sample-target
10+
awslocal s3 mb "s3://$BUCKET"
11+
awslocal s3 cp "job.py" "$S3_URL"
12+
awslocal s3 mb "s3://glue-sample-target"
1113

12-
cluster_identifier=glue_etl_cluster1
14+
awslocal rds create-db-cluster --db-cluster-identifier "$CLUSTER_IDENTIFIER" --engine aurora-postgresql --database-name test
1315

14-
awslocal rds create-db-cluster --db-cluster-identifier $cluster_identifier --engine aurora-postgresql --database-name test
15-
16-
db_port=$(awslocal rds describe-db-clusters --db-cluster-identifier $cluster_identifier) | jq -r '.DBClusters[0].Port'
16+
db_port=$(awslocal rds describe-db-clusters --db-cluster-identifier "$CLUSTER_IDENTIFIER" | jq -r '.DBClusters[0].Port')
1717
echo Using local RDS database on port $db_port ...
1818

1919
echo Creating Glue databases and tables ...
2020
awslocal glue create-database --database-input '{"Name": "legislators"}'
2121
awslocal glue create-table --database legislators \
22-
--table-input '{"Name":"memberships_json", "Parameters": {"connectionName": "$cluster_identifier"}, "StorageDescriptor": {"Location": "test.memberships"}}'
22+
--table-input '{"Name": "memberships_json", "Parameters": {"connectionName": "'$CONNECTION_NAME'"}, "StorageDescriptor": {"Location": "test.memberships"}}'
2323
awslocal glue create-table --database legislators \
24-
--table-input '{"Name":"persons_json", "Parameters": {"connectionName": "$cluster_identifier"}, "StorageDescriptor": {"Location": "test.persons"}}'
24+
--table-input '{"Name": "persons_json", "Parameters": {"connectionName": "'$CONNECTION_NAME'"}, "StorageDescriptor": {"Location": "test.persons"}}'
2525
awslocal glue create-table --database legislators \
26-
--table-input '{"Name":"organizations_json", "Parameters": {"connectionName": "$cluster_identifier"}, "StorageDescriptor": {"Location": "test.organizations"}}'
26+
--table-input '{"Name": "organizations_json", "Parameters": {"connectionName": "'$CONNECTION_NAME'"}, "StorageDescriptor": {"Location": "test.organizations"}}'
2727
awslocal glue create-connection \
28-
--connection-input '{"Name":"$cluster_identifier", "ConnectionType": "JDBC", "ConnectionProperties": {"USERNAME": "test", "PASSWORD": "test", "JDBC_CONNECTION_URL": "jdbc:postgresql://localhost.localstack.cloud:'$db_port'"}}'
28+
--connection-input '{"Name": "'$CONNECTION_NAME'", "ConnectionType": "JDBC", "ConnectionProperties": {"USERNAME": "test", "PASSWORD": "test", "JDBC_CONNECTION_URL": "jdbc:postgresql://localhost.localstack.cloud:'$db_port'"}}'
2929

3030
secret=$(awslocal secretsmanager create-secret --name mysecret --secret-string "12345678" | jq -r ".ARN")
3131

3232
echo Creating Postgres database tables with data ...
33-
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$cluster_identifier --secret-arn $secret --sql 'CREATE TABLE IF NOT EXISTS persons(id varchar, name varchar)'
34-
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$cluster_identifier --secret-arn $secret --sql 'CREATE TABLE IF NOT EXISTS organizations(org_id varchar, org_name varchar)'
35-
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$cluster_identifier --secret-arn $secret --sql 'CREATE TABLE IF NOT EXISTS memberships(person_id varchar, organization_id varchar)'
36-
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$cluster_identifier --secret-arn $secret --sql "insert into persons(id, name) VALUES('p1', 'person 1')"
37-
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$cluster_identifier --secret-arn $secret --sql "insert into organizations(org_id, org_name) VALUES('o1', 'org1')"
38-
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$cluster_identifier --secret-arn $secret --sql "insert into memberships(person_id, organization_id) VALUES('p1', 'o1')"
39-
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$cluster_identifier --secret-arn $secret --sql 'CREATE TABLE IF NOT EXISTS hist_root(id varchar, name varchar, org_id varchar, org_name varchar, person_id varchar, organization_id varchar)'
33+
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$CLUSTER_IDENTIFIER --secret-arn $secret --sql 'CREATE TABLE IF NOT EXISTS persons(id varchar, name varchar)'
34+
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$CLUSTER_IDENTIFIER --secret-arn $secret --sql 'CREATE TABLE IF NOT EXISTS organizations(org_id varchar, org_name varchar)'
35+
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$CLUSTER_IDENTIFIER --secret-arn $secret --sql 'CREATE TABLE IF NOT EXISTS memberships(person_id varchar, organization_id varchar)'
36+
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$CLUSTER_IDENTIFIER --secret-arn $secret --sql "insert into persons(id, name) VALUES('p1', 'person 1')"
37+
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$CLUSTER_IDENTIFIER --secret-arn $secret --sql "insert into organizations(org_id, org_name) VALUES('o1', 'org1')"
38+
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$CLUSTER_IDENTIFIER --secret-arn $secret --sql "insert into memberships(person_id, organization_id) VALUES('p1', 'o1')"
39+
awslocal rds-data execute-statement --resource-arn arn:aws:rds:us-east-1:000000000000:cluster:$CLUSTER_IDENTIFIER --secret-arn $secret --sql 'CREATE TABLE IF NOT EXISTS hist_root(id varchar, name varchar, org_id varchar, org_name varchar, person_id varchar, organization_id varchar)'
4040

4141
echo Starting Glue job from PySpark script ...
4242
awslocal glue create-job --name $JOB_NAME --role r1 \
4343
--command '{"Name": "pythonshell", "ScriptLocation": "'$S3_URL'"}' \
44-
--connections '{"Connections": ["$cluster_identifier"]}'
44+
--connections '{"Connections": ["'$CLUSTER_IDENTIFIER'"]}'
4545
run_id=$(awslocal glue start-job-run --job-name $JOB_NAME | jq -r .JobRunId)
4646

4747
state=$(awslocal glue get-job-run --job-name $JOB_NAME --run-id $run_id | jq -r .JobRun.JobRunState)

0 commit comments

Comments (0)