Skip to content

Commit 64870de

Browse files
authored
Update os dir naming for images (#157)
* Update os dir naming for images * update changelog * Add Docker approach to testing * Add unit test to check no ext case
1 parent 06c0057 commit 64870de

File tree

8 files changed

+82
-5
lines changed

8 files changed

+82
-5
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,5 @@ dmypy.json
141141

142142
# VSCode
143143
.vscode/
144+
145+
sample-docs/*_images

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.5.7
2+
3+
* Hotfix: append an `_images` suffix to the image output directory name so that it no longer collides with the source PDF's own path when the PDF has no file extension
4+
15
## 0.5.6
26

37
* Update the `annotate` and `_get_image_array` methods of `PageLayout` to get the image from the `image_path` property if the `image` property is `None`.

Dockerfile

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# syntax=docker/dockerfile:experimental
2+
FROM quay.io/unstructured-io/base-images:rocky8.7-3 as base
3+
4+
ARG PIP_VERSION
5+
6+
# Set up environment
7+
ENV HOME /home/
8+
WORKDIR ${HOME}
9+
RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
10+
&& ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts
11+
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
12+
ENV PATH="/home/usr/.local/bin:${PATH}"
13+
14+
FROM base as deps
15+
# Copy and install Unstructured
16+
COPY requirements requirements
17+
18+
RUN python3.8 -m pip install pip==${PIP_VERSION} && \
19+
dnf -y groupinstall "Development Tools" && \
20+
pip install --no-cache -r requirements/base.txt && \
21+
pip install --no-cache -r requirements/test.txt && \
22+
pip install --no-cache -r requirements/dev.txt && \
23+
pip install "unstructured.PaddleOCR" && \
24+
dnf -y groupremove "Development Tools" && \
25+
dnf clean all
26+
27+
FROM deps as code
28+
ARG PACKAGE_NAME=unstructured_inference
29+
COPY unstructured_inference unstructured_inference
30+
31+
#CMD ["pytest -m \"not slow\" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing"]
32+
CMD ["/bin/bash"]
33+
#CMD ["bash -c pytest test_unstructured_inference"]

Makefile

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
PACKAGE_NAME := unstructured_inference
22
PIP_VERSION := 23.1.2
3+
CURRENT_DIR := $(shell pwd)
34

45

56
.PHONY: help
@@ -116,3 +117,23 @@ version-sync:
116117
.PHONY: check-coverage
117118
check-coverage:
118119
coverage report --fail-under=95
120+
121+
##########
122+
# Docker #
123+
##########
124+
125+
# Docker targets are provided for convenience only and are not required in a standard development environment
126+
127+
DOCKER_IMAGE ?= unstructured-inference:dev
128+
129+
.PHONY: docker-build
130+
docker-build:
131+
PIP_VERSION=${PIP_VERSION} DOCKER_IMAGE_NAME=${DOCKER_IMAGE} ./scripts/docker-build.sh
132+
133+
.PHONY: docker-test
134+
docker-test: docker-build
135+
docker run --rm \
136+
-v ${CURRENT_DIR}/test_unstructured_inference:/home/test_unstructured_inference \
137+
-v ${CURRENT_DIR}/sample-docs:/home/sample-docs \
138+
$(DOCKER_IMAGE) \
139+
bash -c "pytest $(if $(TEST_NAME),-k $(TEST_NAME),) test_unstructured_inference"

scripts/docker-build.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
#!/usr/bin/env bash
22

33
set -euo pipefail
4+
PIP_VERSION="${PIP_VERSION:-23.1.2}"
5+
DOCKER_IMAGE="unstructured-inference:dev"
46

5-
DOCKER_BUILDKIT=1 docker buildx build --load --platform=linux/amd64 -f Dockerfile \
7+
DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile \
68
--build-arg PIP_VERSION="$PIP_VERSION" \
9+
--build-arg BUILDKIT_INLINE_CACHE=1 \
710
--progress plain \
8-
-t unstructured-inference-dev:latest .
11+
-t "$DOCKER_IMAGE" .)
12+
13+
DOCKER_BUILDKIT=1 "${DOCKER_BUILD_CMD[@]}"

test_unstructured_inference/inference/test_layout.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -836,7 +836,17 @@ def test_create_image_output_dir():
836836
with tempfile.TemporaryDirectory() as tmpdir:
837837
tmp_f_path = os.path.join(tmpdir, "loremipsum.pdf")
838838
output_dir = create_image_output_dir(tmp_f_path)
839-
expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum")
839+
expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum_images")
840+
assert os.path.isdir(output_dir)
841+
assert os.path.isabs(output_dir)
842+
assert output_dir == expected_output_dir
843+
844+
845+
def test_create_image_output_dir_no_ext():
846+
with tempfile.TemporaryDirectory() as tmpdir:
847+
tmp_f_path = os.path.join(tmpdir, "loremipsum_no_ext")
848+
output_dir = create_image_output_dir(tmp_f_path)
849+
expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum_no_ext_images")
840850
assert os.path.isdir(output_dir)
841851
assert os.path.isabs(output_dir)
842852
assert output_dir == expected_output_dir
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.6" # pragma: no cover
1+
__version__ = "0.5.7" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,8 @@ def create_image_output_dir(
470470
directory path"""
471471
parent_dir = os.path.abspath(os.path.dirname(filename))
472472
f_name_without_extension = os.path.splitext(os.path.basename(filename))[0]
473-
output_dir = os.path.join(parent_dir, f_name_without_extension)
473+
474+
# Add a suffix so the output directory path cannot be identical to the input file's own path when the original file has no extension
475+
output_dir = os.path.join(parent_dir, f"{f_name_without_extension}_images")
474476
os.makedirs(output_dir, exist_ok=True)
475477
return output_dir

0 commit comments

Comments
 (0)