Skip to content

Commit 76387fc

Browse files
committed
add Dockerfile for containerized PDF-to-markdown conversion
1 parent edf7b88 commit 76387fc

File tree

1 file changed

+23
-0
lines changed

1 file changed

+23
-0
lines changed

library/scraping/Dockerfile

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,23 @@
# Container image that runs extract_text_from_pdfs.py with the --md flag.
#
# Build, from the directory that contains pyproject.toml, uv.lock, src/ and scraping/:
#   docker build -t <image_name> -f scraping/Dockerfile .
#
# Run. Cap memory and CPUs to avoid OOM issues, and supply .env at run time.
# The .env file is deliberately NOT copied into the image, so its contents
# never end up in an image layer or in `docker history`:
#   docker run --memory="Ng" --memory-swap="Ng" --cpus="K" \
#     -v "$(pwd)/.env:/app/.env:ro" <image_name>
# If the script only needs the variables and not the file itself,
# `--env-file .env` can replace the bind mount.
# NOTE(review): the container now runs as UID 10001; .env must be readable by
# that user, and any output location outside /app must be writable by it.

FROM python:3.13-slim

# wget is used for downloading files. ca-certificates is listed explicitly
# because --no-install-recommends would otherwise not pull it in for wget.
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Install uv from a pinned release instead of :latest so rebuilds are
# reproducible. Bump the tag deliberately (ideally to the uv version that
# produced uv.lock).
COPY --from=ghcr.io/astral-sh/uv:0.8.0 /uv /usr/local/bin/uv

# Unprivileged user that owns /app; nothing below this point needs root.
RUN groupadd --system --gid 10001 app \
    && useradd --system --uid 10001 --gid app --create-home app \
    && install -d -o app -g app /app

WORKDIR /app
USER app

# Dependency layer: only the manifests are copied, so this layer stays cached
# until pyproject.toml or uv.lock changes.
COPY --chown=app:app pyproject.toml uv.lock* ./

# --no-install-project: the project source is not in the image yet, so only
#   the locked dependencies are installed here.
# --no-cache: do not leave uv's download cache behind in this layer.
RUN uv sync --frozen --no-dev --group pdfscraping --no-install-project --no-cache

# Application code, copied last because it changes most often.
COPY --chown=app:app src/library ./library
COPY --chown=app:app scraping/extract_text_from_pdfs.py .

# --no-sync: run inside the environment built above exactly as it is. A plain
# `uv run` re-locks and re-syncs at container start with the default groups,
# which needs network access and would pull in the dev group.
CMD ["uv", "run", "--no-sync", "extract_text_from_pdfs.py", "--md"]

0 commit comments

Comments
 (0)