
Commit ec34fa0

emr serverless python dependencies (#250)
1 parent 367ff58 commit ec34fa0

File tree

14 files changed (+477, -1 lines)

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ node_modules/
 .project
 .settings/
 target/
+volume/

 .idea/

@@ -13,7 +14,7 @@ __pycache__/
 *.log

 .terraform/
-terraform.tfstate
+terraform.tfstate*
 .terraform.lock*

 .venv/
Dockerfile-aws

Lines changed: 58 additions & 0 deletions

# This is a multi-stage Dockerfile that can be used to build many different types of
# bundled dependencies for PySpark projects.
# The `base` stage installs generic tools necessary for packaging.
#
# There are `export-` and `build-` stages for the different types of projects.
# - python-packages - Generic support for Python projects with pyproject.toml
# - poetry - Support for Poetry projects
#
# This Dockerfile is generated automatically as part of the emr-cli tool.
# Feel free to modify it for your needs, but leave the `build-` and `export-`
# stages related to your project.
#
# To build manually, you can use the following command, assuming
# the Docker BuildKit backend is enabled. https://docs.docker.com/build/buildkit/
#
# Example for building a poetry project and saving the output to dist/ folder
# docker build --target export-poetry --output dist .

## ----------------------------------------------------------------------------
## Base stage for python development
## ----------------------------------------------------------------------------
FROM --platform=linux/amd64 amazonlinux:2 AS base

RUN yum install -y python3 tar gzip

ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# EMR 6.x uses Python 3.7 - limit Poetry version to 1.5.1
ENV POETRY_VERSION=1.5.1
RUN python3 -m pip install --upgrade pip
RUN curl -sSL https://install.python-poetry.org | python3 -

ENV PATH="$PATH:/root/.local/bin"

WORKDIR /app

COPY . .

# Test stage - installs test dependencies defined in pyproject.toml
FROM base as test
RUN python3 -m pip install .[test]

## ----------------------------------------------------------------------------
## Build and export stages for Poetry Python projects
## ----------------------------------------------------------------------------
# Build stage for poetry
FROM base as build-poetry
RUN poetry self add poetry-plugin-bundle && \
    poetry bundle venv dist/bundle && \
    tar -czvf dist/pyspark_deps.tar.gz -C dist/bundle . && \
    rm -rf dist/bundle

FROM scratch as export-poetry
COPY --from=build-poetry /app/dist/pyspark_deps.tar.gz /
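For the AWS flow, the archive exported by this Dockerfile has to be uploaded next to the job code so it can be referenced via `spark.archives`. A minimal sketch, assuming a hypothetical bucket name (the actual bucket is created by the Terraform configuration applied with `make deploy-aws`):

```
# Export the bundled venv to dist/ (same command as in the header comment above;
# `make build-aws` does the equivalent using Dockerfile-aws)
docker build --target export-poetry --output dist .

# Upload the archive for the job to pick up (bucket name is a placeholder)
aws s3 cp dist/pyspark_deps.tar.gz s3://<your-emr-bucket>/
```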
Dockerfile-localstack

Lines changed: 30 additions & 0 deletions

## ----------------------------------------------------------------------------
## Base stage for python development
## ----------------------------------------------------------------------------
FROM --platform=linux/amd64 localstack/localstack:latest AS base

ENV VIRTUAL_ENV=/opt/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# EMR 6.x uses Python 3.7 - limit Poetry version to 1.5.1
ENV POETRY_VERSION=1.5.1
RUN python3 -m pip install --upgrade pip
RUN curl -sSL https://install.python-poetry.org | python3 -

ENV PATH="$PATH:/root/.local/bin"

WORKDIR /app

COPY . .

## ----------------------------------------------------------------------------
## Build and export stages for standard Python projects
## ----------------------------------------------------------------------------
# Build stage - installs required dependencies and creates a venv package
FROM base as build-poetry
RUN poetry self add poetry-plugin-bundle && \
    poetry bundle venv dist/bundle

FROM scratch as export-poetry
COPY --from=build-poetry /app/dist/bundle /pyspark_env/
Makefile

Lines changed: 48 additions & 0 deletions

export AWS_ACCESS_KEY_ID ?= test
export AWS_SECRET_ACCESS_KEY ?= test
export AWS_DEFAULT_REGION = us-east-1

init:
	terraform workspace new local &
	terraform workspace new aws &
	terraform init

build:
	docker build . --file Dockerfile-localstack --output .

build-aws:
	docker build . --file Dockerfile-aws --output .

deploy:
	docker-compose up --detach
	terraform workspace select local
	AWS_ENDPOINT_URL=https://localhost.localstack.cloud:4566 terraform apply --auto-approve

deploy-aws:
	terraform workspace select aws
	terraform apply --auto-approve

run:
	terraform workspace select local
	./start_job.sh local

run-aws:
	terraform workspace select aws
	./start_job.sh aws

stop:
	docker-compose down

destroy:
	terraform workspace select local
	./stop-application.sh
	terraform destroy --auto-approve

destroy-aws:
	terraform workspace select aws
	./stop-application.sh aws
	terraform destroy --auto-approve

test-ci:
	make init build deploy run; return_code=`echo $$?`;\
	make stop; exit $$return_code;
README.md

Lines changed: 69 additions & 0 deletions

# EMR Serverless with Python dependencies

[AWS has this example](https://github.com/aws-samples/emr-serverless-samples/tree/main/examples/pyspark/dependencies) of how to add Python dependencies to an EMR job. Unfortunately, the same pattern isn't currently possible on LocalStack. This project serves as an example of a workaround that still lets you add your own dependencies and modules to your EMR Spark jobs.

## Requirements
- Make
- Terraform ~>1.9.1
- [LocalStack](https://github.com/localstack/localstack)
- [awslocal](https://github.com/localstack/awscli-local)

## Init

This initializes Terraform and the Terraform workspaces.

```
make init
```

## Build

This builds the Python dependencies for the Spark job. Here is the first difference with AWS: instead of packaging the dependencies as we do for AWS, we save the environment to the project folder so it can be mounted into the LocalStack container.

```
# For LocalStack, we create a /pyspark_env folder
make build

# For AWS, we create pyspark_deps.tar.gz
make build-aws
```

## Deploy

Creates the following resources:
- IAM role
- IAM policy
- S3 bucket
- EMR Serverless application

```
# Starts LocalStack using docker-compose and applies the Terraform configuration.
LOCALSTACK_AUTH_TOKEN=<your_auth_token> make deploy

# Applies the Terraform configuration to AWS.
make deploy-aws
```

## Run job

We can finally run our Spark job. Notice the differences in `start_job.sh` between LocalStack and AWS. For AWS we add `spark.archives` to the configuration and reference the environment's interpreter as `environment/bin/python`. For LocalStack, we rely on the volume mounted into the container instead of the archive and use the absolute path `/tmp/environment/bin/python`.

```
# LocalStack
make run

# AWS
make run-aws
```
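To make the difference concrete, below is a minimal sketch of what the two `start_job.sh` variants might look like, roughly following the AWS sample linked above. The application ID, role ARN, bucket, and entry-point path are hypothetical placeholders; the actual script is not shown in this commit.

```
# AWS variant (sketch): ship the bundled venv via spark.archives and point PYSPARK_PYTHON at it
aws emr-serverless start-job-run \
  --application-id <application-id> \
  --execution-role-arn <job-role-arn> \
  --job-driver '{
    "sparkSubmit": {
      "entryPoint": "s3://<bucket>/main.py",
      "sparkSubmitParameters": "--conf spark.archives=s3://<bucket>/pyspark_deps.tar.gz#environment --conf spark.emr-serverless.driverEnv.PYSPARK_PYTHON=./environment/bin/python --conf spark.executorEnv.PYSPARK_PYTHON=./environment/bin/python"
    }
  }'

# LocalStack variant (sketch): no archive needed; the venv is already mounted at /tmp/environment
awslocal emr-serverless start-job-run \
  --application-id <application-id> \
  --execution-role-arn <job-role-arn> \
  --job-driver '{
    "sparkSubmit": {
      "entryPoint": "s3://<bucket>/main.py",
      "sparkSubmitParameters": "--conf spark.emr-serverless.driverEnv.PYSPARK_PYTHON=/tmp/environment/bin/python --conf spark.executorEnv.PYSPARK_PYTHON=/tmp/environment/bin/python"
    }
  }'
```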

## Destroy

Finally, we can destroy the environment. We make sure to stop the application first.

```
# LocalStack
make destroy

# AWS
make destroy-aws
```
docker-compose.yml

Lines changed: 22 additions & 0 deletions

services:
  localstack:
    container_name: "${LOCALSTACK_DOCKER_NAME:-localstack-main}"
    # Using this image will significantly decrease the job execution time
    # image: localstack/localstack-pro:latest-bigdata
    image: localstack/localstack-pro:latest
    ports:
      - "127.0.0.1:4566:4566"            # LocalStack Gateway
      - "127.0.0.1:4510-4559:4510-4559"  # external services port range
      - "127.0.0.1:443:443"              # LocalStack HTTPS Gateway (Pro)
    environment:
      # Activate LocalStack Pro: https://docs.localstack.cloud/getting-started/auth-token/
      - LOCALSTACK_AUTH_TOKEN=${LOCALSTACK_AUTH_TOKEN:-}  # required for Pro
      - LOCALSTACK_API_KEY=${LOCALSTACK_API_KEY:-}  # required for CI
      # LocalStack configuration: https://docs.localstack.cloud/references/configuration/
      - DEBUG=${DEBUG:-0}
      - PERSISTENCE=${PERSISTENCE:-0}
      - HIVE_DEFAULT_VERSION=3.1.3
    volumes:
      - "${LOCALSTACK_VOLUME_DIR:-./volume}:/var/lib/localstack"
      - "/var/run/docker.sock:/var/run/docker.sock"
      - "./pyspark_env:/tmp/environment"
Lines changed: 51 additions & 0 deletions

{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "ReadAccessForEMRSamples",
      "Effect": "Allow",
      "Action": [
        "s3:GetObject",
        "s3:ListBucket"
      ],
      "Resource": [
        "arn:aws:s3:::*.elasticmapreduce",
        "arn:aws:s3:::*.elasticmapreduce/*"
      ]
    },
    {
      "Sid": "FullAccessToOutputBucket",
      "Effect": "Allow",
      "Action": [
        "s3:PutObject",
        "s3:GetObject",
        "s3:ListBucket",
        "s3:DeleteObject"
      ],
      "Resource": [
        "arn:aws:s3:::${bucket}",
        "arn:aws:s3:::${bucket}/*"
      ]
    },
    {
      "Sid": "GlueCreateAndReadDataCatalog",
      "Effect": "Allow",
      "Action": [
        "glue:GetDatabase",
        "glue:CreateDatabase",
        "glue:GetDataBases",
        "glue:CreateTable",
        "glue:GetTable",
        "glue:UpdateTable",
        "glue:DeleteTable",
        "glue:GetTables",
        "glue:GetPartition",
        "glue:GetPartitions",
        "glue:CreatePartition",
        "glue:BatchCreatePartition",
        "glue:GetUserDefinedFunctions"
      ],
      "Resource": ["*"]
    }
  ]
}
Lines changed: 11 additions & 0 deletions

{
  "Version": "2012-10-17",
  "Statement": [{
    "Sid": "EMRServerlessTrustPolicy",
    "Action": "sts:AssumeRole",
    "Effect": "Allow",
    "Principal": {
      "Service": "emr-serverless.amazonaws.com"
    }
  }]
}
main.py

Lines changed: 9 additions & 0 deletions

from jobs.spark_run import SparkRun

# importing typer to validate it is in the environment
import typer

if __name__ == "__main__":
    spark_runner = SparkRun()
    spark_runner.run()
    spark_runner.stop()
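The `import typer` only succeeds inside the job if typer is part of the bundled environment, which means it has to be declared as a project dependency. The project's pyproject.toml is not included in this commit; with Poetry, declaring the dependency would look roughly like this:

```
# Hypothetical: declare typer so `poetry bundle` includes it in the packaged venv
poetry add typer
```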
jobs/spark_run.py

Lines changed: 22 additions & 0 deletions

from pyspark.sql import SparkSession
from pyspark.sql.functions import col


class SparkRun:

    def __init__(self) -> None:
        self.spark = SparkSession.builder.appName("ExtremeWeather").getOrCreate()

    def run(self) -> None:
        df = self.spark.createDataFrame(
            [
                ("sue", 32),
                ("li", 3),
                ("bob", 75),
                ("heo", 13),
            ],
            ["first_name", "age"],
        )
        print(df.select(col("first_name"), col("age")).first())

    def stop(self):
        self.spark.stop()
