Skip to content

Commit d751633

Browse files
authored
chore: no chipper model in docker build (#168)
Previously, the Dockerfile attempted to bake in the 2.6GB chipper model but no longer baked in the default detectron2 model. This led to errors when building the image in CI due to size issues. In addition, it is better to pre-load detectron2 rather than chipper in the first place, since detectron2 is the preferred "hi_res" model (as used to be the case). This PR corrects that by baking only the detectron2 model into the image. Since the chipper model is no longer baked into the image, there is now the option of warming/downloading it when the application starts in scripts/app-start.sh, by setting UNSTRUCTURED_DOWNLOAD_CHIPPER=true. By default, it is not downloaded. If it is not downloaded, it will be loaded on demand the first time it is requested. Generally, the download can take about a minute, so that first request would be extra slow.
1 parent c5b90ec commit d751633

File tree

8 files changed

+28
-11
lines changed

8 files changed

+28
-11
lines changed

.github/workflows/docker-publish.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ jobs:
5252
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
5353
steps:
5454
- name: Set up Docker Buildx
55-
uses: docker/setup-buildx-action@v2
55+
uses: docker/setup-buildx-action@v1
5656
with:
5757
driver: ${{ matrix.docker-platform == 'linux/amd64' && 'docker' || 'docker-container' }}
5858
- name: Checkout code

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.0.33
2+
3+
* Image tweak, move application entrypoint to scripts/app-start.sh
4+
15
## 0.0.32
26

37
* Throw 400 error if a PDF is password protected

Dockerfile

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,26 +29,24 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} \
2929
&& su -l ${NB_USER} -c 'pip3.8 install --no-cache -r requirements-base.txt' \
3030
&& dnf -y groupremove "Development Tools" \
3131
&& dnf clean all \
32-
&& ln -s /home/notebook-user/.local/bin/pip /usr/local/bin/pip || true
32+
&& ln -s /home/notebook-user/.local/bin/pip3.8 /usr/local/bin/pip3.8 || true
3333

3434
USER ${NB_USER}
3535

3636
FROM python-deps as model-deps
3737
RUN python3.8 -c "import nltk; nltk.download('punkt')" && \
3838
python3.8 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
39-
UNSTRUCTURED_HI_RES_SUPPORTED_MODEL=chipper python3.8 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
40-
39+
python3.8 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
4140

4241
FROM model-deps as code
4342
COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md
4443
COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml
4544
COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/
4645
COPY --chown=${NB_USER}:${NB_USER} exploration-notebooks exploration-notebooks
4746
COPY --chown=${NB_USER}:${NB_USER} pipeline-notebooks pipeline-notebooks
47+
COPY --chown=${NB_USER}:${NB_USER} scripts/app-start.sh scripts/app-start.sh
4848

49-
ENTRYPOINT ["uvicorn", "prepline_general.api.app:app", \
50-
"--log-config", "logger_config.yaml", \
51-
"--host", "0.0.0.0"]
49+
ENTRYPOINT ["scripts/app-start.sh"]
5250
# Expose a default port of 8000. Note: The EXPOSE instruction does not actually publish the port,
5351
# but some tooling will inspect containers and perform work contingent on networking support declared.
5452
EXPOSE 8000

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ docker-build:
7878

7979
.PHONY: docker-start-api
8080
docker-start-api:
81-
docker run -p 8000:8000 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest --log-config logger_config.yaml --host 0.0.0.0 --port 8000
81+
docker run -p 8000:8000 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -it --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest scripts/app-start.sh
8282

8383
# Note(austin) we need to install the dev dependencies for this to work
8484
# Do we want to build separate dev images?

prepline_general/api/general.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ def return_content_type(filename):
479479

480480

481481
@router.post("/general/v0/general")
482-
@router.post("/general/v0.0.32/general")
482+
@router.post("/general/v0.0.33/general")
483483
def pipeline_1(
484484
request: Request,
485485
gz_uncompressed_content_type: Optional[str] = Form(default=None),

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.32
2+
version: 0.0.33

scripts/app-start.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/usr/bin/env bash
2+
3+
UNSTRUCTURED_DOWNLOAD_CHIPPER=${UNSTRUCTURED_DOWNLOAD_CHIPPER:-"false"}
4+
5+
if [[ "$(echo "${UNSTRUCTURED_DOWNLOAD_CHIPPER}" | tr '[:upper:]' '[:lower:]')" == "true" ]]; then
6+
echo "warming chipper model"
7+
# NOTE(crag): in the cloud, this could add a minute to startup time
8+
UNSTRUCTURED_HI_RES_SUPPORTED_MODEL=chipper python3.8 -c \
9+
"from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
10+
fi
11+
12+
uvicorn prepline_general.api.app:app \
13+
--log-config logger_config.yaml \
14+
--host 0.0.0.0

scripts/docker-smoke-test.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,14 @@ start_container() {
3030

3131
echo Starting container "$name"
3232
docker run -p "$port":"$port" \
33+
--entrypoint uvicorn \
3334
-d \
3435
--rm \
3536
--name "$name" \
3637
--env "UNSTRUCTURED_PARALLEL_MODE_URL=http://localhost:$port/general/v0/general" \
3738
--env "UNSTRUCTURED_PARALLEL_MODE_ENABLED=$use_parallel_mode" \
3839
"$DOCKER_IMAGE" \
39-
--port "$port" --host 0.0.0.0
40+
prepline_general.api.app:app --port "$port" --host 0.0.0.0
4041
}
4142

4243
await_server_ready() {

0 commit comments

Comments
 (0)