Skip to content

Commit d16a670

Browse files
authored
[sdlf-utils] fix legislators example script (#541)
1 parent 3ba3fb4 commit d16a670

File tree

3 files changed

+132
-19
lines changed

3 files changed

+132
-19
lines changed
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Dataset Example
2+
3+
This example demonstrates how to hydrate the data lake with a sample dataset for a specific team.
4+
5+
## Usage
6+
7+
```bash
8+
./deploy.sh -t <team-name> -d <dataset-name> -r <region> [-p <aws-profile>]
9+
```
10+
11+
### Parameters
12+
13+
- `-t <team-name>` (required): Team name (2-12 characters, lowercase letters and numbers only)
14+
- `-d <dataset-name>` (required): Dataset name (2-12 characters, lowercase letters and numbers only)
15+
- `-r <region>` (required): AWS region (e.g., us-east-1, eu-west-1)
16+
- `-p <aws-profile>` (optional): AWS profile to use
17+
- `-h`: Show help message
18+
19+
### Examples
20+
21+
```bash
22+
# Deploy legislators dataset for team "engineering" in us-east-1
23+
./deploy.sh -t engineering -d legislators -r us-east-1
24+
25+
# Deploy customers dataset for team "analytics" in eu-west-1 with specific AWS profile
26+
./deploy.sh -t analytics -d customers -r eu-west-1 -p my-profile
27+
```
28+
29+
## What it creates
30+
31+
1. **Glue Job**: `sdlf-<team-name>-<dataset-name>-glue-job`
32+
2. **IAM Role**: `sdlf-<team-name>-<dataset-name>-glue-role`
33+
3. **CloudFormation Stack**: `sdlf-<team-name>-<dataset-name>-glue-job`
34+
4. **S3 Data**: Uploads sample data to `s3://<raw-bucket>/<team-name>/<dataset-name>/`
35+
36+
## Data Processing
37+
38+
The Glue job processes three JSON files:
39+
- `persons.json` → `persons/` (Parquet)
40+
- `memberships.json` → `memberships/` (Parquet)
41+
- `organizations.json` → `organizations/` (Parquet)
42+
- Creates a joined `history/` dataset partitioned by organization name
43+
44+
## Prerequisites
45+
46+
- AWS CLI configured
47+
- SDLF framework deployed in the specified region
48+
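- IAM permissions to read SSM parameters, upload objects to the artifacts and raw S3 buckets (using the team's KMS data key), and deploy CloudFormation stacks that create named IAM roles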
49+
50+
## Example Resource Names
51+
52+
For team "engineering" and dataset "legislators":
53+
- Glue Job: `sdlf-engineering-legislators-glue-job`
54+
- IAM Role: `sdlf-engineering-legislators-glue-role`
55+
- S3 Path: `s3://<raw-bucket>/engineering/legislators/`
Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,81 @@
11
#!/bin/bash
22
pflag=false
3+
tflag=false
4+
dflag=false
5+
rflag=false
36

47
DIRNAME=$(dirname "$0")
58

69
usage () { echo "
710
-h -- Opens up this help message
811
-p -- Name of the AWS profile to use
12+
-t -- Team name (required)
13+
-d -- Dataset name (required)
14+
-r -- AWS region (required)
915
"; }
10-
options=':p:h'
16+
options=':p:t:d:r:h'
1117
while getopts "$options" option
1218
do
1319
case "$option" in
1420
p ) pflag=true; PROFILE=$OPTARG;;
21+
t ) tflag=true; TEAM=$OPTARG;;
22+
d ) dflag=true; DATASET=$OPTARG;;
23+
r ) rflag=true; REGION=$OPTARG;;
1524
h ) usage; exit;;
1625
\? ) echo "Unknown option: -$OPTARG" >&2; exit 1;;
1726
: ) echo "Missing option argument for -$OPTARG" >&2; exit 1;;
1827
* ) echo "Unimplemented option: -$OPTARG" >&2; exit 1;;
1928
esac
2029
done
2130

31+
if ! "$tflag"
32+
then
33+
echo "Team name is required. Use -t <team-name>" >&2
34+
usage
35+
exit 1
36+
fi
37+
38+
if ! "$dflag"
39+
then
40+
echo "Dataset name is required. Use -d <dataset-name>" >&2
41+
usage
42+
exit 1
43+
fi
44+
45+
if ! "$rflag"
46+
then
47+
echo "AWS region is required. Use -r <region>" >&2
48+
usage
49+
exit 1
50+
fi
51+
2252
if "$pflag"
2353
then
2454
echo "using AWS profile $PROFILE..." >&2
2555
fi
26-
REGION=$(aws configure get region ${PROFILE:+--profile "$PROFILE"})
2756

28-
ARTIFACTS_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/sdlf/storage/rArtifactsBucket/dev" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"})
57+
echo "using team: $TEAM" >&2
58+
echo "using dataset: $DATASET" >&2
59+
echo "using region: $REGION" >&2
60+
61+
ARTIFACTS_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/SDLF2/S3/ArtifactsBucket" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"})
2962
aws s3 cp "$DIRNAME/scripts/legislators-glue-job.py" "s3://$ARTIFACTS_BUCKET/artifacts/" ${PROFILE:+--profile "$PROFILE"}
3063

31-
mkdir "$DIRNAME"/output
64+
mkdir -p "$DIRNAME"/output
3265

33-
function send_legislators()
66+
function send_data()
3467
{
3568
ORIGIN="$DIRNAME/data/"
3669

37-
RAW_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/sdlf/storage/rRawBucket/dev" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"})
38-
KMS_KEY=$(aws --region "$REGION" ssm get-parameter --name "/sdlf/dataset/rKMSDataKey/dev" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"})
70+
RAW_BUCKET=$(aws --region "$REGION" ssm get-parameter --name "/SDLF2/S3/RawBucket" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"})
71+
KMS_KEY=$(aws --region "$REGION" ssm get-parameter --name "/SDLF/KMS/$TEAM/DataKeyId" --query "Parameter.Value" --output text ${PROFILE:+--profile "$PROFILE"})
3972

4073
S3_DESTINATION=s3://$RAW_BUCKET/
4174
COUNT=0
4275
for FILE in "$ORIGIN"/*.json;
4376
do
4477
(( COUNT++ )) || true
45-
aws s3 cp "$FILE" "${S3_DESTINATION}legislators/" --sse aws:kms --sse-kms-key-id "$KMS_KEY" ${PROFILE:+--profile "$PROFILE"}
78+
aws s3 cp "$FILE" "${S3_DESTINATION}${TEAM}/${DATASET}/" --sse aws:kms --sse-kms-key-id "$KMS_KEY" ${PROFILE:+--profile "$PROFILE"}
4679
echo "transferred $COUNT files"
4780
done
4881
}
@@ -58,14 +91,15 @@ aws cloudformation package --template-file "$DIRNAME"/scripts/legislators-glue-j
5891
${PROFILE:+--profile "$PROFILE"} \
5992
--output-template-file "$DIRNAME"/output/packaged-template.yaml
6093

61-
STACK_NAME="sdlf-legislators-glue-job"
94+
STACK_NAME="sdlf-${TEAM}-${DATASET}-glue-job"
6295
aws cloudformation deploy \
6396
--s3-bucket "$ARTIFACTS_BUCKET" --s3-prefix sdlf-utils \
6497
--stack-name "$STACK_NAME" \
6598
--template-file "$DIRNAME"/output/packaged-template.yaml \
66-
--tags Framework=sdlf \
99+
--parameter-overrides pTeamName="$TEAM" pDatasetName="$DATASET" \
100+
--tags Framework=sdlf Team="$TEAM" Dataset="$DATASET" \
67101
--capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" \
68102
--region "$REGION" \
69103
${PROFILE:+--profile "$PROFILE"} || exit 1
70104

71-
send_legislators
105+
send_data

sdlf-utils/workshop-examples/legislators/scripts/legislators-glue-job.yaml

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,24 @@ AWSTemplateFormatVersion: 2010-09-09
22
Description: Glue Job Sample
33

44
Parameters:
5+
pTeamName:
6+
Type: String
7+
Description: Team name for resource naming
8+
AllowedPattern: "[a-z0-9]{2,12}"
9+
ConstraintDescription: Must be between 2 and 12 characters, lowercase letters and numbers only
10+
pDatasetName:
11+
Type: String
12+
Description: Dataset name for resource naming
13+
AllowedPattern: "[a-z0-9]{2,12}"
14+
ConstraintDescription: Must be between 2 and 12 characters, lowercase letters and numbers only
515
pPipelineDeploymentInstance:
616
Type: String
717
Description: specific pipeline stage deployment instance this job is for
818
Default: mainB
919
pArtifactsBucket:
1020
Description: S3 bucket used to store artifacts (from CICD or generated by data pipelines)
1121
Type: AWS::SSM::Parameter::Value<String>
12-
Default: /sdlf/storage/rArtifactsBucket/dev
22+
Default: /SDLF2/S3/ArtifactsBucket
1323
pEnableVpc:
1424
Description: Deploy SDLF resources in a VPC
1525
Type: AWS::SSM::Parameter::Value<String>
@@ -22,6 +32,7 @@ Resources:
2232
rGlueRole:
2333
Type: AWS::IAM::Role
2434
Properties:
35+
RoleName: !Sub sdlf-${pTeamName}-${pDatasetName}-glue-role
2536
Path: /service-role/
2637
AssumeRolePolicyDocument:
2738
Version: 2012-10-17
@@ -37,7 +48,7 @@ Resources:
3748
- !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonS3FullAccess
3849
- !Sub arn:${AWS::Partition}:iam::aws:policy/CloudWatchLogsFullAccess
3950
Policies:
40-
- PolicyName: !Sub sdlf-${pPipelineDeploymentInstance}-glue-job
51+
- PolicyName: !Sub sdlf-${pTeamName}-${pDatasetName}-glue-policy
4152
PolicyDocument:
4253
Version: 2012-10-17
4354
Statement:
@@ -50,17 +61,17 @@ Resources:
5061
- kms:GenerateDataKey*
5162
- kms:ReEncrypt*
5263
Resource:
53-
- "{{resolve:ssm:/sdlf/dataset/rKMSInfraKey/dev:1}}"
54-
- "{{resolve:ssm:/sdlf/dataset/rKMSDataKey/dev:1}}"
55-
- "{{resolve:ssm:/sdlf/storage/rKMSKey/dev:1}}"
64+
- !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/InfraKeyId:1}}"
65+
- !Sub "{{resolve:ssm:/SDLF/KMS/${pTeamName}/DataKeyId:1}}"
66+
- !Sub "{{resolve:ssm:/SDLF2/KMS/KeyArn:1}}"
5667

5768
rGlueJob:
5869
Type: AWS::Glue::Job
5970
Properties:
6071
Command:
6172
Name: glueetl
6273
PythonVersion: "3"
63-
ScriptLocation: !Sub s3://${pArtifactsBucket}/artifacts/${pPipelineDeploymentInstance}-glue-job.py
74+
ScriptLocation: !Sub s3://${pArtifactsBucket}/artifacts/legislators-glue-job.py
6475
DefaultArguments: !If
6576
- RunInVpc
6677
-
@@ -75,6 +86,19 @@ Resources:
7586
MaxRetries: 0
7687
MaxCapacity: 2.0
7788
GlueVersion: "4.0"
78-
Name: !Sub sdlf-${pPipelineDeploymentInstance}-glue-job
79-
SecurityConfiguration: "{{resolve:ssm:/sdlf/dataset/rGlueSecurityConfiguration/dev:1}}"
89+
Name: !Sub sdlf-${pTeamName}-${pDatasetName}-glue-job
90+
SecurityConfiguration: !Sub "{{resolve:ssm:/SDLF/Glue/${pTeamName}/SecurityConfigurationId:1}}"
8091
Role: !Ref rGlueRole
92+
93+
Outputs:
94+
oGlueJobName:
95+
Description: Name of the Glue job
96+
Value: !Ref rGlueJob
97+
Export:
98+
Name: !Sub ${AWS::StackName}-glue-job-name
99+
100+
oGlueRoleArn:
101+
Description: ARN of the Glue job role
102+
Value: !GetAtt rGlueRole.Arn
103+
Export:
104+
Name: !Sub ${AWS::StackName}-glue-role-arn

0 commit comments

Comments
 (0)