Skip to content

Commit 2c591d3

Browse files
liferoad and tvalentyn authored
Change src to my_project (#22)
* Change src to my_project * Update README.md Co-authored-by: tvalentyn <[email protected]> --------- Co-authored-by: tvalentyn <[email protected]>
1 parent aedddc0 commit 2c591d3

File tree

11 files changed

+31
-34
lines changed

11 files changed

+31
-34
lines changed

.isort.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ import_heading_stdlib=standard libraries
1919
import_heading_thirdparty=third party libraries
2020
include_trailing_comma=True
2121
indent=' '
22-
known_dfml=src
22+
known_dfml=my_project
2323
dedup_headings=True
2424
line_length=120
2525
multi_line_output=3

Makefile

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ SILENT:
1717
.DEFAULT_GOAL := help
1818

1919
# Load environment variables from .env file
20+
TF_MODEL_URI :=
2021
include .env
2122
export
2223

@@ -64,6 +65,7 @@ init: init-venv ## Init virtual environment
6465
@./venv/bin/python3 -m pre_commit install --install-hooks --overwrite
6566
@mkdir -p beam-output
6667
@echo "use 'source venv/bin/activate' to activate venv "
68+
@./venv/bin/python3 -m pip install -e .
6769

6870
format: ## Run formatter on source code
6971
@./venv/bin/python3 -m black --config=pyproject.toml .
@@ -85,18 +87,18 @@ clean: clean-lite ## Remove virtual environment, downloaded models, etc
8587
@echo "run 'make init'"
8688

8789
test: lint ## Run tests
88-
@PYTHONPATH="./:./src" ./venv/bin/pytest -s -vv --cov=src --cov-fail-under=50 tests/
90+
./venv/bin/pytest -s -vv --cov=my_project --cov-fail-under=50 tests/
8991

9092
run-direct: ## Run a local test with DirectRunner
9193
@rm -f beam-output/beam_test_out.txt
9294
ifeq ($(MODEL_ENV), "TORCH")
93-
time ./venv/bin/python3 -m src.run \
95+
time ./venv/bin/python3 -m my_project.run \
9496
--input data/openimage_10.txt \
9597
--output beam-output/beam_test_out.txt \
9698
--model_state_dict_path $(MODEL_STATE_DICT_PATH) \
9799
--model_name $(MODEL_NAME)
98100
else
99-
time ./venv/bin/python3 -m src.run \
101+
time ./venv/bin/python3 -m my_project.run \
100102
--input data/openimage_10.txt \
101103
--output beam-output/beam_test_out.txt \
102104
--tf_model_uri $(TF_MODEL_URI)
@@ -110,7 +112,7 @@ docker: ## Build a custom docker image and push it to Artifact Registry
110112
run-df-gpu: ## Run a Dataflow job using the custom container with GPUs
111113
$(eval JOB_NAME := beam-ml-starter-gpu-$(shell date +%s)-$(shell echo $$$$))
112114
ifeq ($(MODEL_ENV), "TORCH")
113-
time ./venv/bin/python3 -m src.run \
115+
time ./venv/bin/python3 -m my_project.run \
114116
--runner DataflowRunner \
115117
--job_name $(JOB_NAME) \
116118
--project $(PROJECT_ID) \
@@ -132,7 +134,7 @@ ifeq ($(MODEL_ENV), "TORCH")
132134
--model_state_dict_path $(MODEL_STATE_DICT_PATH) \
133135
--model_name $(MODEL_NAME)
134136
else
135-
time ./venv/bin/python3 -m src.run \
137+
time ./venv/bin/python3 -m my_project.run \
136138
--runner DataflowRunner \
137139
--job_name $(JOB_NAME) \
138140
--project $(PROJECT_ID) \
@@ -158,7 +160,7 @@ run-df-cpu: ## Run a Dataflow job with CPUs and without Custom Container
158160
@$(shell sed "s|\$${BEAM_VERSION}|$(BEAM_VERSION)|g" requirements.txt > beam-output/requirements.txt)
159161
@$(eval JOB_NAME := beam-ml-starter-cpu-$(shell date +%s)-$(shell echo $$$$))
160162
ifeq ($(MODEL_ENV), "TORCH")
161-
time ./venv/bin/python3 -m src.run \
163+
time ./venv/bin/python3 -m my_project.run \
162164
--runner DataflowRunner \
163165
--job_name $(JOB_NAME) \
164166
--project $(PROJECT_ID) \
@@ -174,7 +176,7 @@ ifeq ($(MODEL_ENV), "TORCH")
174176
--model_state_dict_path $(MODEL_STATE_DICT_PATH) \
175177
--model_name $(MODEL_NAME)
176178
else
177-
time ./venv/bin/python3 -m src.run \
179+
time ./venv/bin/python3 -m my_project.run \
178180
--runner DataflowRunner \
179181
--job_name $(JOB_NAME) \
180182
--project $(PROJECT_ID) \

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ newgrp docker
4141
├── requirements.prod.txt <- Packages for the production environment and produces `requirements.txt`
4242
├── scripts <- utility bash scripts
4343
├── setup.py <- Used in `python setup.py sdist` to create the multi-file python package
44-
├── src <- Source code for use in this project
45-
│   ├── __init__.py <- Makes src a Python module
44+
├── my_project <- Source code for use in this project, also your python package module name
45+
│   ├── __init__.py <- Makes my_project a Python package
4646
│   ├── config.py <- `pydantic` model classes to define sources, sinks, and models
4747
│   ├── pipeline.py <- Builds the Beam RunInference pipeline
4848
│   └── run.py <- A run module to parse the command options and run the Beam pipeline
@@ -118,10 +118,10 @@ The entire code flows in this way:
118118
* `run.py` is called by the`Makefile` targets to parse the input arguments and set `ModelConfig`, `SourceConfig`, and `SinkConfig` defined in `config.py`, then calls `build_pipeline` from `pipeline.py` to build the final Beam pipeline
119119

120120

121-
To customize the pipeline, modify `build_pipeline` in [pipeline.py](https://github.com/google/dataflow-ml-starter/blob/main/src/pipeline.py). It defines how to read the image data from TextIO, pre-process the images, score them, post-process the predictions,
121+
To customize the pipeline, modify `build_pipeline` in [pipeline.py](https://github.com/google/dataflow-ml-starter/blob/main/my_project/pipeline.py). It defines how to read the image data from TextIO, pre-process the images, score them, post-process the predictions,
122122
and at last save the results using TextIO.
123123

124-
[config.py](https://github.com/google/dataflow-ml-starter/blob/main/src/config.py) contains a set of `pydantic` models to specify the configurations for sources, sinks, and models and validate them. Users can easily add more Pytorch classification models. [Here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/inference) contains more examples.
124+
[config.py](https://github.com/google/dataflow-ml-starter/blob/main/my_project/config.py) contains a set of `pydantic` models to specify the configurations for sources, sinks, and models and validate them. Users can easily add more Pytorch classification models. [Here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/inference) contains more examples.
125125

126126
### `.env` Details
127127

@@ -286,7 +286,7 @@ More importantly, building the flex templates container from the custom SDK cont
286286

287287
Since the custom container is already created, it is straightforward to adapt Dataflow Flex Templates:
288288
1. create a [`metadata.json`](https://github.com/google/dataflow-ml-starter/blob/main/flex/metadata.json) file that contains the parameters required by your Beam pipeline. In this example, we can add `input`, `output`, `device`, `model_name`, `model_state_dict_path`, and `tf_model_uri` as the parameters that can be passed in by users. [Here](https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates#example-metadata-file) is another example metadata file.
289-
2. convert the custom container to your template container following [this](https://cloud.google.com/dataflow/docs/guides/templates/configuring-flex-templates#use_custom_container_images). [`tensorflow_gpu.flex.Dockerfile`](https://github.com/google/dataflow-ml-starter/blob/main/tensorflow_gpu.flex.Dockerfile) is one example converted from `tensorflow_gpu.Dockerfile`. Only two parts are needed: switch to the Dataflow Template launcher entrypoint and package `src` into this container. Change `CUSTOM_CONTAINER_IMAGE` in `.env` and run `make docker` to create the custom container for Flex Templates.
289+
2. convert the custom container to your template container following [this](https://cloud.google.com/dataflow/docs/guides/templates/configuring-flex-templates#use_custom_container_images). [`tensorflow_gpu.flex.Dockerfile`](https://github.com/google/dataflow-ml-starter/blob/main/tensorflow_gpu.flex.Dockerfile) is one example converted from `tensorflow_gpu.Dockerfile`. Only two parts are needed: switch to the Dataflow Template launcher entrypoint and package `my_project` into this container. Change `CUSTOM_CONTAINER_IMAGE` in `.env` and run `make docker` to create the custom container for Flex Templates.
290290
3. `make create-flex-template` creates a template spec file in a Cloud Storage bucket defined by the env `TEMPLATE_FILE_GCS_PATH` that contains all of the necessary information to run the job, such as the SDK information and metadata. This calls the CLI `gcloud dataflow flex-template build`.
291291
4. `make run-df-gpu-flex` runs a Flex Template pipeline using the spec file from `TEMPLATE_FILE_GCS_PATH`. This calls the CLI `gcloud dataflow flex-template run`.
292292

File renamed without changes.
File renamed without changes.

src/pipeline.py renamed to my_project/pipeline.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,8 @@
3131
from PIL import Image
3232
from torchvision import models, transforms
3333

34-
try:
35-
from .config import ModelConfig, ModelName, SinkConfig, SourceConfig
36-
except ImportError:
37-
from config import ModelConfig, ModelName, SinkConfig, SourceConfig
34+
# Dataflow ML libraries
35+
from my_project.config import ModelConfig, ModelName, SinkConfig, SourceConfig
3836

3937
import tensorflow as tf # isort:skip
4038

src/run.py renamed to my_project/run.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,9 @@
2323
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
2424
from apache_beam.runners.runner import PipelineResult
2525

26-
try:
27-
from .config import ModelConfig, SinkConfig, SourceConfig
28-
from .pipeline import build_pipeline
29-
except ImportError:
30-
from config import ModelConfig, SinkConfig, SourceConfig
31-
from pipeline import build_pipeline
26+
# Dataflow ML libraries
27+
from my_project.config import ModelConfig, SinkConfig, SourceConfig
28+
from my_project.pipeline import build_pipeline
3229

3330

3431
def parse_known_args(argv):

scripts/check-pipeline.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ vm_ssh="gcloud compute ssh --strict-host-key-checking=no $VM_NAME --project $PRO
4242
vm_scp="gcloud compute scp --strict-host-key-checking=no --project $PROJECT_ID --zone=$ZONE --quiet"
4343

4444
# Package the local code and copy it to VM
45-
PACKAGE_NAME="src-0.0.1"
45+
PACKAGE_NAME="my_project-0.0.1"
4646
python3 setup.py sdist
4747
$vm_ssh "sudo rm -fr ~/*"
4848
$vm_scp dist/$PACKAGE_NAME.tar.gz data/openimage_10.txt $VM_NAME:~/
@@ -54,13 +54,13 @@ echo "Running the PyTorch model on GPU..."
5454
$vm_ssh "docker run --entrypoint /bin/bash \
5555
--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin \
5656
--volume /home/\$USER/:/workspace/\$USER --privileged $CUSTOM_CONTAINER_IMAGE -c \
57-
\"cd \$USER/$PACKAGE_NAME; python -m src.run --input openimage_10.txt --output beam-output/beam_test_out.txt --model_state_dict_path $MODEL_STATE_DICT_PATH --model_name $MODEL_NAME --device GPU\""
57+
\"cd \$USER/$PACKAGE_NAME; python -m my_project.run --input openimage_10.txt --output beam-output/beam_test_out.txt --model_state_dict_path $MODEL_STATE_DICT_PATH --model_name $MODEL_NAME --device GPU\""
5858
else
5959
echo "Running the Tensorflow model on GPU..."
6060
$vm_ssh "docker run --entrypoint /bin/bash \
6161
--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin \
6262
--volume /home/\$USER/:/workspace/\$USER --privileged $CUSTOM_CONTAINER_IMAGE -c \
63-
\"cd \$USER/$PACKAGE_NAME; python -m src.run --input openimage_10.txt --output beam-output/beam_test_out.txt --tf_model_uri $TF_MODEL_URI --device GPU\""
63+
\"cd \$USER/$PACKAGE_NAME; python -m my_project.run --input openimage_10.txt --output beam-output/beam_test_out.txt --tf_model_uri $TF_MODEL_URI --device GPU\""
6464
fi
6565

6666
$vm_ssh "[ -f './$PACKAGE_NAME/beam-output/beam_test_out.txt' ] && echo 'The DirectRunner run succeeded on GPU!' || echo 'The DirectRunner run failed on GPU!'"

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
required = f.read().splitlines()
2525

2626
setuptools.setup(
27-
name="src",
27+
name="my_project",
2828
version="0.0.1",
2929
install_requires=required,
30-
packages=["src"],
30+
packages=["my_project"],
3131
)

tensorflow_gpu.flex.Dockerfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,16 +45,16 @@ RUN \
4545
&& pip install --no-cache-dir torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118
4646

4747
# Copy the run module
48-
COPY src/ /workspace/src
49-
RUN rm -fr /workspace/src/__pycache__
48+
COPY my_project/ /workspace/my_project
49+
RUN rm -fr /workspace/my_project/__pycache__
5050

5151
#Specifies which Python file to run to launch the Flex Template.
52-
ENV FLEX_TEMPLATE_PYTHON_PY_FILE="src/run.py"
52+
ENV FLEX_TEMPLATE_PYTHON_PY_FILE="my_project/run.py"
5353

5454
# Since we already downloaded all the dependencies, there's no need to rebuild everything.
5555
ENV PIP_NO_DEPS=True
5656

57-
ENV PYTHONPATH "${PYTHONPATH}:/workspace/src/"
57+
ENV PYTHONPATH "${PYTHONPATH}:/workspace/my_project/"
5858

5959
# Copy the Dataflow Template launcher
6060
COPY --from=template_launcher /opt/google/dataflow/python_template_launcher /opt/google/dataflow/python_template_launcher

0 commit comments

Comments
 (0)