chore: no chipper model in docker build (#168)

cragwolfe · web-flow · commit d75163371896 · 2023-07-31T11:26:08.000-07:00
Previously, the Dockerfile was attempting to bake in the 2.6GB chipper model, but was no longer baking in the default detectron2 model. This was leading to errors when attempting to build the image in CI due to size issues. In addition, it is better to pre-load detectron2 rather than Chipper anyway, since detectron2 is the preferred "hi_res" model (and pre-loading it is what the image used to do).

This PR corrects that by only baking the detectron2 model into the image.

Since the chipper model is no longer baked into the image, scripts/app-start.sh can optionally warm (download) it when the application starts; this is enabled by setting UNSTRUCTURED_DOWNLOAD_CHIPPER=true. By default, it is not downloaded.*

*If it is not downloaded at startup, it will be downloaded on demand the first time it is requested. The download generally takes about a minute, so that first request will be extra slow.
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
@@ -52,7 +52,7 @@ jobs:
       SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
     steps:
     - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v2
+      uses: docker/setup-buildx-action@v1
       with:
         driver: ${{ matrix.docker-platform == 'linux/amd64' && 'docker' || 'docker-container' }}
     - name: Checkout code
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.0.33
+
+* Image tweak, move application entrypoint to scripts/app-start.sh
+
 ## 0.0.32
 
 * Throw 400 error if a PDF is password protected
diff --git a/Dockerfile b/Dockerfile
@@ -29,26 +29,24 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} \
   && su -l ${NB_USER} -c 'pip3.8 install  --no-cache  -r requirements-base.txt' \
   && dnf -y groupremove "Development Tools" \
   && dnf clean all \
-  && ln -s /home/notebook-user/.local/bin/pip /usr/local/bin/pip || true
+  && ln -s /home/notebook-user/.local/bin/pip3.8 /usr/local/bin/pip3.8 || true
 
 USER ${NB_USER}
 
 FROM python-deps as model-deps
 RUN python3.8 -c "import nltk; nltk.download('punkt')" && \
   python3.8 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
-   UNSTRUCTURED_HI_RES_SUPPORTED_MODEL=chipper python3.8 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
-
+  python3.8 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
 
 FROM model-deps as code
 COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md
 COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml
 COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/
 COPY --chown=${NB_USER}:${NB_USER} exploration-notebooks exploration-notebooks
 COPY --chown=${NB_USER}:${NB_USER} pipeline-notebooks pipeline-notebooks
+COPY --chown=${NB_USER}:${NB_USER} scripts/app-start.sh scripts/app-start.sh
 
-ENTRYPOINT ["uvicorn", "prepline_general.api.app:app", \
-  "--log-config", "logger_config.yaml", \
-  "--host", "0.0.0.0"]
+ENTRYPOINT ["scripts/app-start.sh"]
 # Expose a default port of 8000. Note: The EXPOSE instruction does not actually publish the port,
 # but some tooling will inspect containers and perform work contingent on networking support declared.
 EXPOSE 8000
diff --git a/Makefile b/Makefile
@@ -78,7 +78,7 @@ docker-build:
 
 .PHONY: docker-start-api
 docker-start-api:
-	docker run -p 8000:8000 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest --log-config logger_config.yaml --host 0.0.0.0 --port 8000
+	docker run -p 8000:8000 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -it --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest scripts/app-start.sh
 
 # Note(austin) we need to install the dev dependencies for this to work
 # Do we want to build separate dev images?
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -479,7 +479,7 @@ def return_content_type(filename):
 
 
 @router.post("/general/v0/general")
-@router.post("/general/v0.0.32/general")
+@router.post("/general/v0.0.33/general")
 def pipeline_1(
     request: Request,
     gz_uncompressed_content_type: Optional[str] = Form(default=None),
diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml
@@ -1,2 +1,2 @@
 name: general
-version: 0.0.32
+version: 0.0.33
diff --git a/scripts/app-start.sh b/scripts/app-start.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Container entrypoint: optionally warm the chipper model, then start the API server.
+UNSTRUCTURED_DOWNLOAD_CHIPPER=${UNSTRUCTURED_DOWNLOAD_CHIPPER:-"false"}
+
+if [[ "$(echo "${UNSTRUCTURED_DOWNLOAD_CHIPPER}" | tr '[:upper:]' '[:lower:]')" == "true" ]]; then
+  echo "warming chipper model"
+  # NOTE(crag): in the cloud, this could add a minute to startup time
+  UNSTRUCTURED_HI_RES_SUPPORTED_MODEL=chipper python3.8 -c \
+    "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
+fi
+
+# exec replaces this shell with uvicorn so it receives SIGTERM from "docker stop" directly.
+exec uvicorn prepline_general.api.app:app \
+  --log-config logger_config.yaml --host 0.0.0.0
diff --git a/scripts/docker-smoke-test.sh b/scripts/docker-smoke-test.sh
@@ -30,13 +30,14 @@ start_container() {
 
     echo Starting container "$name"
     docker run -p "$port":"$port" \
+	   --entrypoint uvicorn \
            -d \
            --rm \
            --name "$name" \
            --env "UNSTRUCTURED_PARALLEL_MODE_URL=http://localhost:$port/general/v0/general" \
            --env "UNSTRUCTURED_PARALLEL_MODE_ENABLED=$use_parallel_mode" \
            "$DOCKER_IMAGE" \
-           --port "$port" --host 0.0.0.0
+           prepline_general.api.app:app --port "$port" --host 0.0.0.0
 }
 
 await_server_ready() {

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`name: general`
`2`		`-version: 0.0.32`
	`2`	`+version: 0.0.33`