From 9c9f38850e5bb6ead1c6a5d71c4d86f514ee5b9c Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Wed, 7 May 2025 23:52:50 -0600 Subject: [PATCH 01/13] add bucket listener, lambda handler --- .gitignore | 4 ++ infra/README.md | 58 ++++++++++++++++++ infra/app.py | 28 +++++++++ infra/aws/__init__.py | 0 infra/aws/fiboa_sda.py | 34 +++++++++++ infra/cdk.json | 91 +++++++++++++++++++++++++++++ infra/lambda-image/Dockerfile | 9 +++ infra/lambda-image/main.py | 107 ++++++++++++++++++++++++++++++++++ infra/requirements-dev.txt | 1 + infra/requirements.txt | 2 + 10 files changed, 334 insertions(+) create mode 100644 infra/README.md create mode 100644 infra/app.py create mode 100644 infra/aws/__init__.py create mode 100644 infra/aws/fiboa_sda.py create mode 100644 infra/cdk.json create mode 100644 infra/lambda-image/Dockerfile create mode 100644 infra/lambda-image/main.py create mode 100644 infra/requirements-dev.txt create mode 100644 infra/requirements.txt diff --git a/.gitignore b/.gitignore index 0a19790..3d2e3cd 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,7 @@ cython_debug/ # PyPI configuration file .pypirc + +# CDK asset staging directory +.cdk.staging +cdk.out diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..c53f0b5 --- /dev/null +++ b/infra/README.md @@ -0,0 +1,58 @@ + +# Welcome to your CDK Python project! + +This is a blank project for CDK development with Python. + +The `cdk.json` file tells the CDK Toolkit how to execute your app. + +This project is set up like a standard Python project. The initialization +process also creates a virtualenv within this project, stored under the `.venv` +directory. To create the virtualenv it assumes that there is a `python3` +(or `python` for Windows) executable in your path with access to the `venv` +package. If for any reason the automatic creation of the virtualenv fails, +you can create the virtualenv manually. + +To manually create a virtualenv on MacOS and Linux: + +``` +$ python3 -m venv .venv +``` + +After the init process completes and the virtualenv is created, you can use the following +step to activate your virtualenv. + +``` +$ source .venv/bin/activate +``` + +If you are a Windows platform, you would activate the virtualenv like this: + +``` +% .venv\Scripts\activate.bat +``` + +Once the virtualenv is activated, you can install the required dependencies. + +``` +$ pip install -r requirements.txt +``` + +At this point you can now synthesize the CloudFormation template for this code. + +``` +$ cdk synth +``` + +To add additional dependencies, for example other CDK libraries, just add +them to your `setup.py` file and rerun the `pip install -r requirements.txt` +command. + +## Useful commands + + * `cdk ls` list all stacks in the app + * `cdk synth` emits the synthesized CloudFormation template + * `cdk deploy` deploy this stack to your default AWS account/region + * `cdk diff` compare deployed stack with current state + * `cdk docs` open CDK documentation + +Enjoy! diff --git a/infra/app.py b/infra/app.py new file mode 100644 index 0000000..94e244a --- /dev/null +++ b/infra/app.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import os + +import aws_cdk as cdk + +from infra.aws.fiboa_sda import FiboaSdaStack + + +app = cdk.App() +FiboaSdaStack(app, "FiboaSdaStackTesting", + # If you don't specify 'env', this stack will be environment-agnostic. + # Account/Region-dependent features and context lookups will not work, + # but a single synthesized template can be deployed anywhere. + + # Uncomment the next line to specialize this stack for the AWS Account + # and Region that are implied by the current CLI configuration. + + #env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), + + # Uncomment the next line if you know exactly what Account and Region you + # want to deploy the stack to. */ + + #env=cdk.Environment(account='123456789012', region='us-east-1'), + + # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html + ) + +app.synth() diff --git a/infra/aws/__init__.py b/infra/aws/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/infra/aws/fiboa_sda.py b/infra/aws/fiboa_sda.py new file mode 100644 index 0000000..4aefbd1 --- /dev/null +++ b/infra/aws/fiboa_sda.py @@ -0,0 +1,34 @@ +import os + +from aws_cdk import ( + aws_lambda as _lambda, + aws_s3 as _s3, + aws_s3_notifications, + Stack, Duration +) +from constructs import Construct + +class FiboaSdaStack(Stack): + + def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: + super().__init__(scope, construct_id, **kwargs) + + # Create a new container image + ecr_image = _lambda.EcrImageCode.from_asset_image( + directory=os.path.join(os.getcwd(), "lambda-image") + ) + + # create lambda function + function = _lambda.Function(self, "fiboa-s3-listener", + runtime=_lambda.Runtime.FROM_IMAGE, + handler=_lambda.Handler.FROM_IMAGE, + architecture=_lambda.Architecture.ARM_64, + timeout=Duration.seconds(10), code=ecr_image) + # create s3 bucket + s3 = _s3.Bucket(self, "fiboa-sda-testing") + + # create s3 notification for lambda function + notification = aws_s3_notifications.LambdaDestination(function) + + # assign notification for the s3 event type (ex: OBJECT_CREATED) + s3.add_event_notification(_s3.EventType.OBJECT_CREATED, notification, _s3.NotificationKeyFilter(prefix="fiboa/*")) \ No newline at end of file diff --git a/infra/cdk.json b/infra/cdk.json new file mode 100644 index 0000000..355dca3 --- /dev/null +++ b/infra/cdk.json @@ -0,0 +1,91 @@ +{ + "app": "python3 app.py", + "watch": { + "include": [ + "**" + ], + "exclude": [ + "README.md", + "cdk*.json", + "requirements*.txt", + "source.bat", + "**/__init__.py", + "**/__pycache__", + "tests" + ] + }, + "context": { + "@aws-cdk/aws-lambda:recognizeLayerVersion": true, + "@aws-cdk/core:checkSecretUsage": true, + "@aws-cdk/core:target-partitions": [ + "aws", + "aws-cn" + ], + "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, + "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, + "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, + "@aws-cdk/aws-iam:minimizePolicies": true, + "@aws-cdk/core:validateSnapshotRemovalPolicy": true, + "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, + "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, + "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, + "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, + "@aws-cdk/core:enablePartitionLiterals": true, + "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, + "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, + "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, + "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, + "@aws-cdk/aws-route53-patters:useCertificate": true, + "@aws-cdk/customresources:installLatestAwsSdkDefault": false, + "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, + "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, + "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, + "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, + "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, + "@aws-cdk/aws-redshift:columnId": true, + "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, + "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, + "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, + "@aws-cdk/aws-kms:aliasNameRef": true, + "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, + "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, + "@aws-cdk/aws-efs:denyAnonymousAccess": true, + "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, + "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, + "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, + "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, + "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, + "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, + "@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true, + "@aws-cdk/aws-cloudwatch-actions:changeLambdaPermissionLogicalIdForLambdaAction": true, + "@aws-cdk/aws-codepipeline:crossAccountKeysDefaultValueToFalse": true, + "@aws-cdk/aws-codepipeline:defaultPipelineTypeToV2": true, + "@aws-cdk/aws-kms:reduceCrossAccountRegionPolicyScope": true, + "@aws-cdk/aws-eks:nodegroupNameAttribute": true, + "@aws-cdk/aws-ec2:ebsDefaultGp3Volume": true, + "@aws-cdk/aws-ecs:removeDefaultDeploymentAlarm": true, + "@aws-cdk/custom-resources:logApiResponseDataPropertyTrueDefault": false, + "@aws-cdk/aws-s3:keepNotificationInImportedBucket": false, + "@aws-cdk/aws-ecs:enableImdsBlockingDeprecatedFeature": false, + "@aws-cdk/aws-ecs:disableEcsImdsBlocking": true, + "@aws-cdk/aws-ecs:reduceEc2FargateCloudWatchPermissions": true, + "@aws-cdk/aws-dynamodb:resourcePolicyPerReplica": true, + "@aws-cdk/aws-ec2:ec2SumTImeoutEnabled": true, + "@aws-cdk/aws-appsync:appSyncGraphQLAPIScopeLambdaPermission": true, + "@aws-cdk/aws-rds:setCorrectValueForDatabaseInstanceReadReplicaInstanceResourceId": true, + "@aws-cdk/core:cfnIncludeRejectComplexResourceUpdateCreatePolicyIntrinsics": true, + "@aws-cdk/aws-lambda-nodejs:sdkV3ExcludeSmithyPackages": true, + "@aws-cdk/aws-stepfunctions-tasks:fixRunEcsTaskPolicy": true, + "@aws-cdk/aws-ec2:bastionHostUseAmazonLinux2023ByDefault": true, + "@aws-cdk/aws-route53-targets:userPoolDomainNameMethodWithoutCustomResource": true, + "@aws-cdk/aws-elasticloadbalancingV2:albDualstackWithoutPublicIpv4SecurityGroupRulesDefault": true, + "@aws-cdk/aws-iam:oidcRejectUnauthorizedConnections": true, + "@aws-cdk/core:enableAdditionalMetadataCollection": true, + "@aws-cdk/aws-lambda:createNewPoliciesWithAddToRolePolicy": false, + "@aws-cdk/aws-s3:setUniqueReplicationRoleName": true, + "@aws-cdk/aws-events:requireEventBusPolicySid": true, + "@aws-cdk/core:aspectPrioritiesMutating": true, + "@aws-cdk/aws-dynamodb:retainTableReplica": true, + "@aws-cdk/aws-stepfunctions:useDistributedMapResultWriterV2": true + } +} diff --git a/infra/lambda-image/Dockerfile b/infra/lambda-image/Dockerfile new file mode 100644 index 0000000..8895891 --- /dev/null +++ b/infra/lambda-image/Dockerfile @@ -0,0 +1,9 @@ +FROM public.ecr.aws/lambda/python:3.11 + +WORKDIR ${LAMBDA_TASK_ROOT} + +COPY main.py . +RUN pip install slack-sdk==3.35.0 requests==2.32.2 tenacity==9.0.0 + +# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) +CMD [ "main.handler" ] diff --git a/infra/lambda-image/main.py b/infra/lambda-image/main.py new file mode 100644 index 0000000..4593808 --- /dev/null +++ b/infra/lambda-image/main.py @@ -0,0 +1,107 @@ +import os +import logging +import random +from urllib.parse import urljoin + +import requests +from slack_sdk import WebhookClient +from slack_sdk.http_retry import RateLimitErrorRetryHandler +from tenacity import retry, wait_exponential + +logger = logging.getLogger() +logger.setLevel("INFO") + +SLACK_WEBHOOK_URL = os.getenv( + "SLACK_APP_URL", + None, +) +SOURCE_COOP_URL = os.getenv("SOURCE_COOP_URL", "https://source.coop") +EMOJIS = [ + ":tractor:", + ":farmer:", + ":corn:", + ":strawberry:", + ":ear_of_rice:", + ":seedling:", + ":hatching_chick:", + ":ladybug:", + ":sunflower:", +] + + +@retry(wait=wait_exponential(multiplier=1, min=4, max=10)) +def _fetch_repo(repo_name: str): + """Fetch the repo from source.coop.""" + r = requests.get( + urljoin(SOURCE_COOP_URL, f"/api/v1/repositories/fiboa/{repo_name}") + ) + r.raise_for_status() + resp_json = r.json() + return resp_json + + +def _send_slack_notification(repo_name: str, webhook_client: WebhookClient): + """Send a notification to a slack webhook, indicating a new fiboa dataset is available.""" + repo = _fetch_repo(repo_name) + repo_meta = repo["meta"] + + formatted_tags = " ".join(sorted([f"`{tag}`" for tag in repo_meta["tags"]])) + emoji = random.choice(EMOJIS) + body = { + "blocks": [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": f"{repo_meta['title']} {emoji}", + "emoji": True, + }, + }, + {"type": "section", "text": {"type": "mrkdwn", "text": formatted_tags}}, + {"type": "divider"}, + { + "type": "section", + "text": {"type": "mrkdwn", "text": repo_meta["description"]}, + "accessory": { + "type": "button", + "text": { + "type": "plain_text", + "text": "source.coop", + "emoji": True, + }, + "value": "source.coop", + "style": "primary", + "url": f"https://source.coop/repositories/fiboa/{repo_name}/description", + "action_id": "button-action", + }, + }, + ] + } + print("SENDING BODY - ", body) + # webhook_client.send(**body) + + +def handler(event, context): + rate_limit_handler = RateLimitErrorRetryHandler(max_retry_count=3) + webhook_client = WebhookClient( + url=SLACK_WEBHOOK_URL, + retry_handlers=[rate_limit_handler], + timeout=10, + ) + for record in event["Records"]: + key: str = record["s3"]["object"]["key"] + if not key.startswith("fiboa/"): + logger.info(f"Skipping key - {key}") + continue + repo_name = key.split("/")[1] + if key.endswith("README.md"): + # New source.coop dataset, send slack notification. + repo_name = key.split("/")[1] + _send_slack_notification(repo_name, webhook_client) + logger.info(f"Sent slack notification for {repo_name}") + elif key.endswith(".parquet"): + # New fiboa dataset, send to AWS batch. + pass + else: + logger.info(f"Skipping key - {key}") + continue diff --git a/infra/requirements-dev.txt b/infra/requirements-dev.txt new file mode 100644 index 0000000..9270945 --- /dev/null +++ b/infra/requirements-dev.txt @@ -0,0 +1 @@ +pytest==6.2.5 diff --git a/infra/requirements.txt b/infra/requirements.txt new file mode 100644 index 0000000..805552e --- /dev/null +++ b/infra/requirements.txt @@ -0,0 +1,2 @@ +aws-cdk-lib==2.190.0 +constructs>=10.0.0,<11.0.0 From 22aaa053531456b50b7bd8feb373d026c40b7796 Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Tue, 13 May 2025 22:04:59 -0600 Subject: [PATCH 02/13] add batch stack --- infra/app.py | 15 +++---- infra/aws/fiboa_sda.py | 90 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 83 insertions(+), 22 deletions(-) diff --git a/infra/app.py b/infra/app.py index 94e244a..74dd1f2 100644 --- a/infra/app.py +++ b/infra/app.py @@ -7,22 +7,19 @@ app = cdk.App() -FiboaSdaStack(app, "FiboaSdaStackTesting", +FiboaSdaStack( + app, + "FiboaSdaStackTesting", # If you don't specify 'env', this stack will be environment-agnostic. # Account/Region-dependent features and context lookups will not work, # but a single synthesized template can be deployed anywhere. - # Uncomment the next line to specialize this stack for the AWS Account # and Region that are implied by the current CLI configuration. - - #env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), - + # env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), # Uncomment the next line if you know exactly what Account and Region you # want to deploy the stack to. */ - - #env=cdk.Environment(account='123456789012', region='us-east-1'), - + # env=cdk.Environment(account='123456789012', region='us-east-1'), # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html - ) +) app.synth() diff --git a/infra/aws/fiboa_sda.py b/infra/aws/fiboa_sda.py index 4aefbd1..00cab6c 100644 --- a/infra/aws/fiboa_sda.py +++ b/infra/aws/fiboa_sda.py @@ -1,34 +1,98 @@ import os from aws_cdk import ( + aws_ec2 as ec2, + aws_ecs as ecs, + aws_ecr_assets as ecr_assets, aws_lambda as _lambda, - aws_s3 as _s3, + aws_iam as iam, + aws_s3 as s3, + aws_batch as batch, aws_s3_notifications, - Stack, Duration + Stack, + Duration, + Size, ) from constructs import Construct -class FiboaSdaStack(Stack): +class FiboaSdaStack(Stack): def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: super().__init__(scope, construct_id, **kwargs) - # Create a new container image - ecr_image = _lambda.EcrImageCode.from_asset_image( - directory=os.path.join(os.getcwd(), "lambda-image") + # Create a VPC for the batch fargate cluster + vpc = ec2.Vpc(self, "VPC") + + # Create AWS Batch Job Queue + self.batch_queue = batch.JobQueue(self, "JobQueue") + fargate_spot_environment = batch.FargateComputeEnvironment( + self, + "FargateSpotEnv", + vpc_subnets=ec2.SubnetSelection( + subnet_type=ec2.SubnetType.PRIVATE_WITH_NAT + ), + vpc=vpc, + spot=True, + ) + self.batch_queue.add_compute_environment(fargate_spot_environment, 0) + + # Task execution IAM role for Fargate + task_execution_role = iam.Role( + self, + "TaskExecutionRole", + assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"), + managed_policies=[ + iam.ManagedPolicy.from_aws_managed_policy_name( + "service-role/AmazonECSTaskExecutionRolePolicy" + ) + ], + ) + + # image_asset = ecr_assets.DockerImageAsset( + # self, "MyImageAsset", + # directory=os.path.join(os.getcwd(), "..") + # ) + + # Create Job Definition to submit job in batch job queue. + batch.EcsJobDefinition( + self, + "MyJobDef", + container=batch.EcsFargateContainerDefinition( + self, + "FargateCDKJobDef", + image=ecs.ContainerImage.from_asset( + directory=os.path.join(os.getcwd(), "..") + ), + command=["ingest-one"], + memory=Size.gibibytes(16), + cpu=2, + execution_role=task_execution_role, + ), ) # create lambda function - function = _lambda.Function(self, "fiboa-s3-listener", - runtime=_lambda.Runtime.FROM_IMAGE, - handler=_lambda.Handler.FROM_IMAGE, - architecture=_lambda.Architecture.ARM_64, - timeout=Duration.seconds(10), code=ecr_image) + # todo - create an IAM role with access to batch. + # todo - inject environment + function = _lambda.Function( + self, + "fiboa-s3-listener", + runtime=_lambda.Runtime.FROM_IMAGE, + handler=_lambda.Handler.FROM_IMAGE, + architecture=_lambda.Architecture.ARM_64, + timeout=Duration.seconds(10), + code=_lambda.EcrImageCode.from_asset_image( + directory=os.path.join(os.getcwd(), "lambda-image"), + ), + ) # create s3 bucket - s3 = _s3.Bucket(self, "fiboa-sda-testing") + bucket = s3.Bucket(self, "fiboa-sda-testing") # create s3 notification for lambda function notification = aws_s3_notifications.LambdaDestination(function) # assign notification for the s3 event type (ex: OBJECT_CREATED) - s3.add_event_notification(_s3.EventType.OBJECT_CREATED, notification, _s3.NotificationKeyFilter(prefix="fiboa/*")) \ No newline at end of file + bucket.add_event_notification( + s3.EventType.OBJECT_CREATED, + notification, + s3.NotificationKeyFilter(prefix="fiboa/*"), + ) From 5f9bd26749236e337e3d5240b15cb3ea81d5fe97 Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Tue, 13 May 2025 22:05:06 -0600 Subject: [PATCH 03/13] add deploy action --- .github/workflows/aws-deploy.yaml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/aws-deploy.yaml diff --git a/.github/workflows/aws-deploy.yaml b/.github/workflows/aws-deploy.yaml new file mode 100644 index 0000000..a8d4891 --- /dev/null +++ b/.github/workflows/aws-deploy.yaml @@ -0,0 +1,31 @@ +on: + push: + - add-lambda-handler + +jobs: + aws_cdk: + runs-on: ubuntu-latest + steps: + + - name: cdk diff + uses: youyo/aws-cdk-github-actions@v2 + with: + cdk_subcommand: 'diff' + actions_comment: true + working_dir: 'infra' + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: 'us-west-2' + + - name: cdk deploy + uses: youyo/aws-cdk-github-actions@v2 + with: + cdk_subcommand: 'deploy' + cdk_args: '--require-approval never' + actions_comment: false + working_dir: 'infra' + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: 'us-west-2' \ No newline at end of file From bf39847d646236570b86c8511a92dc267c6ae225 Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Tue, 13 May 2025 22:05:43 -0600 Subject: [PATCH 04/13] syntax --- .github/workflows/aws-deploy.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/aws-deploy.yaml b/.github/workflows/aws-deploy.yaml index a8d4891..78ecfb5 100644 --- a/.github/workflows/aws-deploy.yaml +++ b/.github/workflows/aws-deploy.yaml @@ -1,6 +1,7 @@ on: push: - - add-lambda-handler + branches: + - add-lambda-handler jobs: aws_cdk: From c930a71231a4b79e72d495796137190cf56210c4 Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Tue, 13 May 2025 22:06:47 -0600 Subject: [PATCH 05/13] delete publish.yaml, checkout repo --- .github/workflows/aws-deploy.yaml | 4 ++- .github/workflows/publish.yaml | 46 ------------------------------- 2 files changed, 3 insertions(+), 47 deletions(-) delete mode 100644 .github/workflows/publish.yaml diff --git a/.github/workflows/aws-deploy.yaml b/.github/workflows/aws-deploy.yaml index 78ecfb5..00577c4 100644 --- a/.github/workflows/aws-deploy.yaml +++ b/.github/workflows/aws-deploy.yaml @@ -7,7 +7,9 @@ jobs: aws_cdk: runs-on: ubuntu-latest steps: - + - name: Check out the repo + uses: actions/checkout@v4 + - name: cdk diff uses: youyo/aws-cdk-github-actions@v2 with: diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml deleted file mode 100644 index 5502eb2..0000000 --- a/.github/workflows/publish.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: Publish Docker image - -on: - release: - types: [published] - -jobs: - push_to_registry: - name: Push Docker image to Docker Hub - runs-on: ubuntu-latest - permissions: - packages: write - contents: read - attestations: write - id-token: write - steps: - - name: Check out the repo - uses: actions/checkout@v4 - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-west-2 - - - name: Login to Amazon ECR - id: login-ecr - uses: aws-actions/amazon-ecr-login@v1 - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v5 - with: - images: 767828762635.dkr.ecr.us-west-2.amazonaws.com/fiboa/fiboa-sda - - - name: Build and push Docker image - id: push - uses: docker/build-push-action@v6 - with: - context: . - file: ./Dockerfile - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - From 4814b2bc2e9c5e547f042a57ad40efe5e13c8c93 Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Tue, 13 May 2025 22:08:18 -0600 Subject: [PATCH 06/13] fix import structure --- infra/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/app.py b/infra/app.py index 74dd1f2..216906e 100644 --- a/infra/app.py +++ b/infra/app.py @@ -3,7 +3,7 @@ import aws_cdk as cdk -from infra.aws.fiboa_sda import FiboaSdaStack +from aws.fiboa_sda import FiboaSdaStack app = cdk.App() From bb2fe1f383ee6781f50eb016399c00dc68404b07 Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Tue, 13 May 2025 22:09:55 -0600 Subject: [PATCH 07/13] skip diff --- .github/workflows/aws-deploy.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/aws-deploy.yaml b/.github/workflows/aws-deploy.yaml index 00577c4..9f02b2e 100644 --- a/.github/workflows/aws-deploy.yaml +++ b/.github/workflows/aws-deploy.yaml @@ -9,17 +9,17 @@ jobs: steps: - name: Check out the repo uses: actions/checkout@v4 - - - name: cdk diff - uses: youyo/aws-cdk-github-actions@v2 - with: - cdk_subcommand: 'diff' - actions_comment: true - working_dir: 'infra' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: 'us-west-2' + + # - name: cdk diff + # uses: youyo/aws-cdk-github-actions@v2 + # with: + # cdk_subcommand: 'diff' + # actions_comment: true + # working_dir: 'infra' + # env: + # AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + # AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + # AWS_DEFAULT_REGION: 'us-west-2' - name: cdk deploy uses: youyo/aws-cdk-github-actions@v2 From 780418f999d2df3e1e8667c63ca95bd8f66d2b0f Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Tue, 13 May 2025 22:11:22 -0600 Subject: [PATCH 08/13] add .dockerignore --- .dockerignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0d97f6f --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +infra/ \ No newline at end of file From 8663daba6c895963b81aacea0402b0c28562c405 Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Tue, 13 May 2025 23:29:33 -0600 Subject: [PATCH 09/13] ;unsigned requests --- fiboa_sda/ingest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fiboa_sda/ingest.py b/fiboa_sda/ingest.py index 2b0f101..86971dd 100644 --- a/fiboa_sda/ingest.py +++ b/fiboa_sda/ingest.py @@ -1,10 +1,11 @@ import functools import concurrent.futures import json -import multiprocessing import tempfile import boto3 +from botocore import UNSIGNED +from botocore.config import Config import geopandas as gpd import pandas as pd import pyarrow as pa @@ -17,7 +18,7 @@ settings = get_settings() logger = get_logger(__name__) BUCKET_NAME = "us-west-2.opendata.source.coop" -S3_CLIENT = boto3.client('s3', region_name="us-west-2") +S3_CLIENT = boto3.client('s3', region_name="us-west-2", config=Config(signature_version=UNSIGNED)) # pyarrow schema used when writing out parquet # files to BQ From 6c7b2f24dc5563cb432b3d63fed7dc276a7f394a Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Wed, 14 May 2025 22:52:19 -0600 Subject: [PATCH 10/13] inject credentials --- fiboa_sda/ingest.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fiboa_sda/ingest.py b/fiboa_sda/ingest.py index 2b0f101..12298df 100644 --- a/fiboa_sda/ingest.py +++ b/fiboa_sda/ingest.py @@ -1,7 +1,7 @@ import functools import concurrent.futures import json -import multiprocessing +import os import tempfile import boto3 @@ -9,6 +9,7 @@ import pandas as pd import pyarrow as pa from google.cloud import bigquery +from google.oauth2 import service_account from fiboa_sda.logger import get_logger, TimerFunc from fiboa_sda.metrics import calculate_geometry_metrics @@ -67,7 +68,11 @@ def get_s3_key_for_dataset(fiboa_id: str) -> list[str]: def write_to_bq( df: pd.DataFrame, project_name: str, dataset_name: str, table_name: str ) -> None: - client = bigquery.Client(project=project_name) + credentials = None + if service_account_file := os.getenv("SERVICE_ACCOUNT_FILE"): + credentials = service_account.Credentials.from_service_account_info(json.loads(service_account_file)) + + client = bigquery.Client(project=project_name, credentials=credentials) job_config = bigquery.LoadJobConfig(source_format="PARQUET") with tempfile.NamedTemporaryFile() as tmp: From 2b4fcee209b526de44a1509c9c07c8bbee66265c Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Wed, 14 May 2025 23:32:47 -0600 Subject: [PATCH 11/13] bugfix --- fiboa_sda/ingest.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fiboa_sda/ingest.py b/fiboa_sda/ingest.py index 814fe3e..c333526 100644 --- a/fiboa_sda/ingest.py +++ b/fiboa_sda/ingest.py @@ -97,12 +97,13 @@ def normalize_dataset(df: gpd.GeoDataFrame, repository_id: str, s3_path: str) -> # Dump any fields that aren't part of fiboa into a JSON column. available_external_fields = set(df.columns) - set(settings.FIBOA_FIELDS) - df["external_fields"] = df.apply( - lambda row: json.dumps( - {field: row[field] for field in available_external_fields} - ), - axis=1, - ) + + def _coalesce(row): + d = row.to_dict() + data = {field: d[field] for field in available_external_fields} + return json.dumps(data) + + df["external_fields"] = df.apply(_coalesce, axis=1) df = df[list(settings.FIBOA_FIELDS) + ["external_fields"]] # Drop the geometry-metrics fields, we'll recalculate these. From f5a62b133c0673bcd4dd089f5b8b534fa168800e Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Thu, 15 May 2025 06:36:05 -0600 Subject: [PATCH 12/13] switch to source.coop bucket --- infra/aws/fiboa_sda.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/infra/aws/fiboa_sda.py b/infra/aws/fiboa_sda.py index 00cab6c..53f8940 100644 --- a/infra/aws/fiboa_sda.py +++ b/infra/aws/fiboa_sda.py @@ -85,7 +85,8 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: ), ) # create s3 bucket - bucket = s3.Bucket(self, "fiboa-sda-testing") + # bucket = s3.Bucket(self, "fiboa-sda-testing") + bucket = s3.Bucket.from_bucket_name(self, "source-coop-bucket", bucket_name="us-west-2.opendata.source.coop") # create s3 notification for lambda function notification = aws_s3_notifications.LambdaDestination(function) From 0c61616c6ac4569b8b7007f405c3be6903c008a2 Mon Sep 17 00:00:00 2001 From: Jeff Albrecht Date: Thu, 15 May 2025 13:07:26 -0600 Subject: [PATCH 13/13] use sns event source with prefix filter --- infra/aws/fiboa_sda.py | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/infra/aws/fiboa_sda.py b/infra/aws/fiboa_sda.py index 53f8940..b301c2a 100644 --- a/infra/aws/fiboa_sda.py +++ b/infra/aws/fiboa_sda.py @@ -5,10 +5,12 @@ aws_ecs as ecs, aws_ecr_assets as ecr_assets, aws_lambda as _lambda, + aws_lambda_event_sources as lambda_events, aws_iam as iam, aws_s3 as s3, aws_batch as batch, aws_s3_notifications, + aws_sns as sns, Stack, Duration, Size, @@ -84,16 +86,32 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: directory=os.path.join(os.getcwd(), "lambda-image"), ), ) - # create s3 bucket - # bucket = s3.Bucket(self, "fiboa-sda-testing") - bucket = s3.Bucket.from_bucket_name(self, "source-coop-bucket", bucket_name="us-west-2.opendata.source.coop") - # create s3 notification for lambda function - notification = aws_s3_notifications.LambdaDestination(function) - - # assign notification for the s3 event type (ex: OBJECT_CREATED) - bucket.add_event_notification( - s3.EventType.OBJECT_CREATED, - notification, - s3.NotificationKeyFilter(prefix="fiboa/*"), + # add SNS event source + topic = sns.Topic.from_topic_arn( + self, + "source-coop-topic", + topic_arn="arn:aws:sns:us-west-2:417712557820:us-west-2-opendata-source-coop_new-object" ) + event_source = lambda_events.SnsEventSource( + topic, + filter_policy={ + "S3Key": sns.SubscriptionFilter.string_filter(match_prefixes=['fiboa']) + } + ) + function.add_event_source(event_source) + + + # # create s3 bucket + # # bucket = s3.Bucket(self, "fiboa-sda-testing") + # bucket = s3.Bucket.from_bucket_name(self, "source-coop-bucket", bucket_name="us-west-2.opendata.source.coop") + + # # create s3 notification for lambda function + # # notification = aws_s3_notifications.LambdaDestination(function) + + # # assign notification for the s3 event type (ex: OBJECT_CREATED) + # bucket.add_event_notification( + # s3.EventType.OBJECT_CREATED, + # notification, + # s3.NotificationKeyFilter(prefix="fiboa/*"), + # )