Commit 7a4421b
Add the example about how to use Flex Template with RunInference (#18)
* add the flex template
* updated the dockerfile
* updated the parameters
* update the dockerfile
* added PYTHONPATH
* remove the pubsub option
* remove time
1 parent b64d0ef commit 7a4421b

File tree

6 files changed

+199 -6 lines changed

Makefile

Lines changed: 46 additions & 0 deletions
@@ -207,3 +207,49 @@ check-torch-gpu: ## Check whether PyTorch works on GPU using VM with Custom Container
 check-pipeline: ## Check whether the Beam pipeline can run on GPU using VM with Custom Container and DirectRunner
 	@./scripts/check-pipeline.sh
+
+create-flex-template: ## Create a Flex Template file using a Flex Template custom container
+	gcloud dataflow flex-template build $(TEMPLATE_FILE_GCS_PATH) \
+	--image $(CUSTOM_CONTAINER_IMAGE) \
+	--metadata-file ./flex/metadata.json \
+	--sdk-language "PYTHON" \
+	--staging-location $(STAGING_LOCATION) \
+	--temp-location $(TEMP_LOCATION) \
+	--project $(PROJECT_ID) \
+	--worker-region $(REGION) \
+	--worker-machine-type $(MACHINE_TYPE)
+
+run-df-gpu-flex: ## Run a Dataflow job using the Flex Template
+	$(eval JOB_NAME := beam-ml-starter-gpu-flex-$(shell date +%s)-$(shell echo $$$$))
+ifeq ($(MODEL_ENV), "TORCH")
+	gcloud dataflow flex-template run $(JOB_NAME) \
+	--template-file-gcs-location $(TEMPLATE_FILE_GCS_PATH) \
+	--project $(PROJECT_ID) \
+	--region $(REGION) \
+	--worker-machine-type $(MACHINE_TYPE) \
+	--additional-experiments disable_worker_container_image_prepull \
+	--parameters number_of_worker_harness_threads=1 \
+	--parameters sdk_location=container \
+	--parameters sdk_container_image=$(CUSTOM_CONTAINER_IMAGE) \
+	--parameters dataflow_service_option=$(SERVICE_OPTIONS) \
+	--parameters input=$(INPUT_DATA) \
+	--parameters output=$(OUTPUT_DATA) \
+	--parameters device=GPU \
+	--parameters model_state_dict_path=$(MODEL_STATE_DICT_PATH) \
+	--parameters model_name=$(MODEL_NAME)
+else
+	gcloud dataflow flex-template run $(JOB_NAME) \
+	--template-file-gcs-location $(TEMPLATE_FILE_GCS_PATH) \
+	--project $(PROJECT_ID) \
+	--region $(REGION) \
+	--worker-machine-type $(MACHINE_TYPE) \
+	--additional-experiments disable_worker_container_image_prepull \
+	--parameters number_of_worker_harness_threads=1 \
+	--parameters sdk_location=container \
+	--parameters sdk_container_image=$(CUSTOM_CONTAINER_IMAGE) \
+	--parameters dataflow_service_option=$(SERVICE_OPTIONS) \
+	--parameters input=$(INPUT_DATA) \
+	--parameters output=$(OUTPUT_DATA) \
+	--parameters device=GPU \
+	--parameters tf_model_uri=$(TF_MODEL_URI)
+endif
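As a sketch of what the `run-df-gpu-flex` recipe expands to, the snippet below assembles the same `gcloud dataflow flex-template run` argument list in Python. The helper function and all example values are hypothetical stand-ins for the `.env` variables the Makefile reads; the flag names are those used by the recipe above.

```python
import time

def flex_run_command(job_name, template_path, project, region, machine_type, parameters):
    """Assemble the gcloud invocation the way the Makefile recipe does (sketch)."""
    cmd = [
        "gcloud", "dataflow", "flex-template", "run", job_name,
        "--template-file-gcs-location", template_path,
        "--project", project,
        "--region", region,
        "--worker-machine-type", machine_type,
        "--additional-experiments", "disable_worker_container_image_prepull",
    ]
    # Each pipeline option becomes a repeated `--parameters key=value` flag.
    for key, value in parameters.items():
        cmd += ["--parameters", f"{key}={value}"]
    return cmd

# Hypothetical example values; in the repo these come from .env.
cmd = flex_run_command(
    job_name=f"beam-ml-starter-gpu-flex-{int(time.time())}",
    template_path="gs://my-bucket/templates/spec.json",
    project="my-project",
    region="us-central1",
    machine_type="n1-standard-4",
    parameters={
        "number_of_worker_harness_threads": 1,
        "sdk_location": "container",
        "device": "GPU",
        "tf_model_uri": "gs://my-bucket/models/resnet101",
    },
)
print(" ".join(cmd))
```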

README.md

Lines changed: 20 additions & 3 deletions
@@ -58,7 +58,7 @@ newgrp docker
 ### Step 1: Clone this repo and edit .env
 
 ```bash
-git clone https://github.com/liferoad/df-ml-starter.git
+git clone https://github.com/google/dataflow-ml-starter.git
 cd df-ml-starter
 cp .env.template .env
 ```
@@ -271,9 +271,23 @@ Note the cost and time depends on your job settings and the regions.
 ### Run the Beam pipeline with the Pub/Sub source
 When `INPUT_DATA` from the `.env` file defines a valid Pub/Sub topic (e.g., `projects/apache-beam-testing/topics/Imagenet_openimage_50k_benchmark`),
 the Beam pipeline is created using the Pub/Sub source with `FixedWindows` and switches to `beam.io.fileio.WriteToFiles` that supports the streaming pipeline.
-We use `shards=0` here since 0 shards is the recommended approach and Dataflow would decide how many files it should write.
+Note that for this toy example, writing the predictions to a GCS bucket is inefficient because each output file is only a few bytes.
+In practice, you might tune [the autoscaling options](https://cloud.google.com/dataflow/docs/guides/troubleshoot-autoscaling) to optimize the streaming pipeline performance.
 Note that the streaming job will run forever until it is canceled or drained.
 
+### Run the Beam pipeline with Dataflow Flex Templates
+If you prefer to package all your code into a custom container so that users can easily run your Beam pipeline,
+a Dataflow Flex Template lets you create and run a template job with the Google Cloud CLI or the Google Cloud console. (More benefits of templates are listed [here](https://cloud.google.com/dataflow/docs/concepts/dataflow-templates#benefits).)
+
+Since the custom container is already created, it is straightforward to adopt Dataflow Flex Templates:
+1. Create a `metadata.json` file that contains the parameters required by your Beam pipeline. In this example, we add `input`, `output`, `device`, `model_name`, `model_state_dict_path`, and `tf_model_uri` as the parameters that users can pass in. [Here](https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates#example-metadata-file) is another example metadata file.
+2. Convert the custom container to a template container by following [this guide](https://cloud.google.com/dataflow/docs/guides/templates/configuring-flex-templates#use_custom_container_images). `tensorflow_gpu.flex.Dockerfile` is one example converted from `tensorflow_gpu.Dockerfile`. Only two changes are needed: switch to the Dataflow Template launcher entrypoint and package `src` into the container. Change `CUSTOM_CONTAINER_IMAGE` in `.env` and run `make docker` to build the custom container for Flex Templates.
+3. `make create-flex-template` creates a template spec file in the Cloud Storage bucket defined by the env variable `TEMPLATE_FILE_GCS_PATH`; the spec contains all of the information necessary to run the job, such as the SDK information and metadata. This calls the CLI `gcloud dataflow flex-template build`.
+4. `make run-df-gpu-flex` runs a Flex Template pipeline using the spec file from `TEMPLATE_FILE_GCS_PATH`. This calls the CLI `gcloud dataflow flex-template run`.
+
+More information about Flex Templates can be found [here](https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates).
+
 ## FAQ
 
 ### Permission error when using any GCP command
@@ -328,4 +342,7 @@ exec /opt/apache/beam/boot: no such file or directory
 * https://cloud.google.com/dataflow/docs/gpu/use-gpus#custom-container
 * https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/
 * https://github.com/apache/beam/blob/master/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy
-* https://cloud.google.com/dataflow/docs/gpu/troubleshoot-gpus#debug-vm
+* https://cloud.google.com/dataflow/docs/gpu/troubleshoot-gpus#debug-vm
+* https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/dataflow/flex-templates/streaming_beam
+* https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates
+* https://cloud.google.com/dataflow/docs/guides/templates/configuring-flex-templates#use_custom_container_images
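The `--parameters key=value` pairs passed in step 4 above reach the pipeline as ordinary `--key=value` command-line flags. A minimal sketch of splitting them from the remaining Beam options, modeled on (but not copied from) `parse_known_args` in `src/run.py`; the flag names follow `flex/metadata.json`, and the example argv values are hypothetical:

```python
import argparse

def parse_known_args(argv):
    """Split the template's flags from the remaining Beam pipeline options."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="GCS path or Pub/Sub topic")
    parser.add_argument("--output", required=True, help="GCS path for the predictions")
    parser.add_argument("--device", default="CPU", choices=["CPU", "GPU"])
    parser.add_argument("--model_name", default=None)
    parser.add_argument("--model_state_dict_path", default=None)
    parser.add_argument("--tf_model_uri", default=None)
    # Unrecognized flags (e.g. --runner) are returned for PipelineOptions.
    return parser.parse_known_args(argv)

# The launcher turns `--parameters device=GPU` into the argv flag `--device=GPU`.
known, beam_args = parse_known_args([
    "--input=gs://bucket/images.txt",
    "--output=gs://bucket/predictions",
    "--device=GPU",
    "--tf_model_uri=gs://bucket/models/mobilenet_v2",
    "--runner=DataflowRunner",
])
print(known.device, beam_args)
```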

flex/metadata.json

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+{
+    "name": "Beam RunInference Python flex template",
+    "description": "Beam RunInference example for a Python flex template.",
+    "parameters": [
+        {
+            "name": "input",
+            "label": "Input data",
+            "helpText": "Input image URI data from a GCS bucket or a Pub/Sub topic"
+        },
+        {
+            "name": "output",
+            "label": "Output GCS bucket path",
+            "helpText": "A GCS bucket that stores the model predictions"
+        },
+        {
+            "name": "tf_model_uri",
+            "label": "TensorFlow model URI",
+            "helpText": "A valid TensorFlow model URI",
+            "isOptional": true
+        },
+        {
+            "name": "model_name",
+            "label": "A PyTorch model name",
+            "helpText": "A model name, e.g. resnet101",
+            "isOptional": true
+        },
+        {
+            "name": "model_state_dict_path",
+            "label": "A PyTorch model state path",
+            "helpText": "Path to the model's state_dict",
+            "isOptional": true
+        },
+        {
+            "name": "device",
+            "label": "Device to run models",
+            "helpText": "The device could be either CPU or GPU",
+            "isOptional": true
+        },
+        {
+            "name": "disk_size_gb",
+            "label": "disk_size_gb",
+            "helpText": "The disk size in GB for each worker",
+            "isOptional": true
+        },
+        {
+            "name": "dataflow_service_option",
+            "label": "dataflow_service_option",
+            "helpText": "The dataflow_service_option for workers",
+            "isOptional": true
+        }
+    ]
+}
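When a template job is launched, the supplied parameters are checked against this metadata: entries without `"isOptional": true` are required. A small sketch of that check, using a hypothetical helper and an inline subset of the metadata above:

```python
import json

# Inline subset of flex/metadata.json; `input` and `output` omit isOptional
# and are therefore required, while `device` is optional.
METADATA = json.loads("""
{
    "name": "Beam RunInference Python flex template",
    "parameters": [
        {"name": "input", "label": "Input data", "helpText": "GCS path or Pub/Sub topic"},
        {"name": "output", "label": "Output GCS bucket path", "helpText": "GCS path for predictions"},
        {"name": "device", "label": "Device to run models", "helpText": "CPU or GPU", "isOptional": true}
    ]
}
""")

def missing_required(metadata, supplied):
    """Return the required parameter names absent from the supplied dict."""
    return [
        p["name"]
        for p in metadata["parameters"]
        if not p.get("isOptional", False) and p["name"] not in supplied
    ]

print(missing_required(METADATA, {"input": "gs://bucket/in"}))
print(missing_required(METADATA, {"input": "gs://bucket/in", "output": "gs://bucket/out"}))
```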

src/pipeline.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,10 @@
 from PIL import Image
 from torchvision import models, transforms
 
-from .config import ModelConfig, ModelName, SinkConfig, SourceConfig
+try:
+    from .config import ModelConfig, ModelName, SinkConfig, SourceConfig
+except ImportError:
+    from config import ModelConfig, ModelName, SinkConfig, SourceConfig
 
 import tensorflow as tf  # isort:skip

src/run.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,12 @@
 from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
 from apache_beam.runners.runner import PipelineResult
 
-from .config import ModelConfig, SinkConfig, SourceConfig
-from .pipeline import build_pipeline
+try:
+    from .config import ModelConfig, SinkConfig, SourceConfig
+    from .pipeline import build_pipeline
+except ImportError:
+    from config import ModelConfig, SinkConfig, SourceConfig
+    from pipeline import build_pipeline
 
 
 def parse_known_args(argv):
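The try/except import fallback above exists because the Flex Template launcher puts `/workspace/src/` on `PYTHONPATH` and runs `run.py` as a top-level script, while local development imports it as part of the `src` package. A self-contained demo of the pattern; the `config.py` module written here is a hypothetical stand-in for `src/config.py`:

```python
import pathlib
import sys
import tempfile

# Hypothetical stand-in for src/config.py so the fallback can be exercised.
workdir = pathlib.Path(tempfile.mkdtemp())
(workdir / "config.py").write_text("MODEL_NAME = 'resnet101'\n")

# Emulate the launcher putting /workspace/src/ on PYTHONPATH.
sys.path.insert(0, str(workdir))

try:
    # Inside the src package (e.g. `python -m src.run`) this relative
    # import succeeds...
    from .config import MODEL_NAME
except ImportError:
    # ...but in a top-level script it raises ImportError, so fall back to
    # the plain import that resolves through PYTHONPATH.
    from config import MODEL_NAME

print(MODEL_NAME)
```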

tensorflow_gpu.flex.Dockerfile

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
+# Copyright 2023 Google LLC
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This needs Python 3.8 for your local runtime environment.
+
+FROM gcr.io/dataflow-templates-base/flex-template-launcher-image:latest as template_launcher
+
+# Select an NVIDIA base image with the desired GPU stack from https://ngc.nvidia.com/catalog/containers/nvidia:cuda
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
+
+WORKDIR /workspace
+
+COPY requirements.txt requirements.txt
+
+RUN \
+    # Add the Deadsnakes repository that has a variety of Python packages for Ubuntu.
+    # See: https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa
+    apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F23C5A6CF475977595C89F51BA6932366A755776 \
+    && echo "deb http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main" >> /etc/apt/sources.list.d/custom.list \
+    && echo "deb-src http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main" >> /etc/apt/sources.list.d/custom.list \
+    && apt-get update \
+    && apt-get install -y curl \
+        python3.8 \
+        python3.8-venv \
+        python3-venv \
+        # With the python3.8 package, distutils needs to be installed separately.
+        python3-distutils \
+    && rm -rf /var/lib/apt/lists/* \
+    && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \
+    && curl https://bootstrap.pypa.io/get-pip.py | python \
+    && pip install --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt \
+    && pip install --no-cache-dir tensorflow==2.12.1 \
+    && pip install --no-cache-dir torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118
+
+# Copy the run module
+COPY src/ /workspace/src
+RUN rm -fr /workspace/src/__pycache__
+
+# Specifies which Python file to run to launch the Flex Template.
+ENV FLEX_TEMPLATE_PYTHON_PY_FILE="src/run.py"
+
+# Since we already downloaded all the dependencies, there is no need to rebuild everything.
+ENV PIP_NO_DEPS=True
+
+ENV PYTHONPATH "${PYTHONPATH}:/workspace/src/"
+
+# Copy the Dataflow Template launcher
+COPY --from=template_launcher /opt/google/dataflow/python_template_launcher /opt/google/dataflow/python_template_launcher
+
+# Copy files from the official SDK image, including scripts/dependencies.
+# Note Python 3.8 is used due to the NVIDIA base image.
+# BEAM_VERSION must be supplied via --build-arg; an ARG declaration is needed
+# before the variable can be expanded in the COPY --from stage reference.
+ARG BEAM_VERSION
+COPY --from=apache/beam_python3.8_sdk:${BEAM_VERSION} /opt/apache/beam /opt/apache/beam
+
+# Set the entrypoint to the Dataflow Template launcher.
+# Use this if the launcher image is different from the custom container image:
+# ENTRYPOINT ["/opt/google/dataflow/python_template_launcher"]
+
+# Set the entrypoint to the Apache Beam SDK launcher.
+ENTRYPOINT ["/opt/apache/beam/boot"]
