Skip to content

Commit 77107fe

Browse files
committed
Sync branch with main
Signed-off-by: Christoph Auer <[email protected]>
2 parents f06664a + dd03b53 commit 77107fe

File tree

252 files changed

+29399
-8862218
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

252 files changed

+29399
-8862218
lines changed

.github/workflows/checks.yml

Lines changed: 271 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -2,91 +2,308 @@ on:
22
workflow_call:
33
inputs:
44
push_coverage:
5-
type: boolean
6-
description: "If true, the coverage results are pushed to codecov.io."
7-
default: true
5+
type: boolean
6+
description: "If true, the coverage results are pushed to codecov.io."
7+
default: true
88
secrets:
99
CODECOV_TOKEN:
10-
required: false
10+
required: false
1111

1212
env:
13-
HF_HUB_DOWNLOAD_TIMEOUT: "60"
14-
HF_HUB_ETAG_TIMEOUT: "60"
13+
HF_HUB_DOWNLOAD_TIMEOUT: "90"
14+
HF_HUB_ETAG_TIMEOUT: "90"
1515
UV_FROZEN: "1"
16+
PYTEST_ML: |-
17+
tests/test_e2e_conversion.py
18+
tests/test_e2e_ocr_conversion.py
19+
tests/test_backend_webp.py
20+
tests/test_asr_pipeline.py
21+
tests/test_threaded_pipeline.py
22+
PYTEST_TO_SKIP: |-
23+
EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping)\.py$'
1624

1725
jobs:
18-
run-checks:
26+
lint:
1927
runs-on: ubuntu-latest
2028
strategy:
29+
fail-fast: false
2130
matrix:
22-
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
31+
python-version: ['3.12']
2332
steps:
24-
- uses: actions/checkout@v4
25-
- name: Install tesseract and ffmpeg
26-
run: sudo apt-get update && sudo apt-get install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
27-
- name: Set TESSDATA_PREFIX
28-
run: |
29-
echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
30-
- name: Cache Hugging Face models
31-
uses: actions/cache@v4
32-
with:
33-
path: ~/.cache/huggingface
34-
key: huggingface-cache-py${{ matrix.python-version }}
33+
- uses: actions/checkout@v5
34+
3535
- name: Install uv and set the python version
36-
uses: astral-sh/setup-uv@v5
36+
uses: astral-sh/setup-uv@v6
3737
with:
3838
python-version: ${{ matrix.python-version }}
3939
enable-cache: true
40-
- name: pre-commit cache key
40+
41+
- name: Set pre-commit cache key
4142
run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV"
42-
- uses: actions/cache@v4
43+
44+
- name: Cache pre-commit environments
45+
uses: actions/cache@v4
4346
with:
4447
path: ~/.cache/pre-commit
4548
key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
46-
- name: Install dependencies
49+
restore-keys: |
50+
pre-commit|${{ env.PY }}|
51+
52+
- name: Install Python Dependencies
4753
run: uv sync --frozen --all-extras
48-
- name: Check style and run tests
49-
run: pre-commit run --all-files
50-
- name: Testing
51-
run: |
52-
uv run --no-sync pytest -v --cov=docling --cov-report=xml tests
53-
- name: Upload coverage to Codecov
54-
if: inputs.push_coverage
55-
uses: codecov/codecov-action@v5
56-
with:
57-
token: ${{ secrets.CODECOV_TOKEN }}
58-
files: ./coverage.xml
59-
- name: Run examples
54+
55+
- name: Check style
6056
run: |
61-
for file in docs/examples/*.py; do
62-
# Skip batch_convert.py
63-
if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
64-
echo "Skipping $file"
65-
continue
57+
echo "--- Running pre-commit style checks ---"
58+
uv run pre-commit run --all-files
59+
60+
run-tests-1:
61+
runs-on: ubuntu-latest
62+
strategy:
63+
fail-fast: false
64+
matrix:
65+
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
66+
steps:
67+
- uses: actions/checkout@v5
68+
69+
- name: Grant permissions to APT cache directory # allows restore
70+
run: sudo chown -R $USER:$USER /var/cache/apt/archives
71+
72+
- name: Cache APT packages
73+
id: apt-cache
74+
uses: actions/cache@v4
75+
with:
76+
path: /var/cache/apt/archives
77+
key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }}
78+
restore-keys: |
79+
apt-packages-${{ runner.os }}-
80+
81+
- name: Install System Dependencies
82+
run: |
83+
sudo apt-get -qq update
84+
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
85+
86+
- name: Set TESSDATA_PREFIX
87+
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
88+
89+
- name: Install uv and set the python version
90+
uses: astral-sh/setup-uv@v6
91+
with:
92+
python-version: ${{ matrix.python-version }}
93+
94+
- name: Install Python Dependencies
95+
run: uv sync --frozen --all-extras
96+
97+
- name: Cache Models
98+
uses: actions/cache@v4
99+
with:
100+
path: |
101+
~/.cache/huggingface
102+
~/.cache/modelscope
103+
~/.EasyOCR/
104+
key: models-cache
105+
106+
- name: Pre-download Models
107+
run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])"
108+
109+
- name: Run tests for GROUP1
110+
run: |
111+
echo "--- Running tests ---"
112+
GROUP1=$(echo "$PYTEST_ML" | sed -e 's/^/--ignore=/' | tr '\n' ' ')
113+
echo "Running tests for GROUP1"
114+
uv run pytest -v --durations=0 --cov=docling --cov-report=xml --cov-context=test $GROUP1
115+
116+
- name: Upload coverage to Codecov
117+
if: inputs.push_coverage
118+
uses: codecov/codecov-action@v5
119+
with:
120+
token: ${{ secrets.CODECOV_TOKEN }}
121+
files: ./coverage.xml
122+
flags: run-tests-1
123+
124+
- name: Grant permissions to APT cache directory # allows backup
125+
run: sudo chown -R $USER:$USER /var/cache/apt/archives
126+
127+
run-tests-2:
128+
runs-on: ubuntu-latest
129+
strategy:
130+
fail-fast: false
131+
matrix:
132+
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
133+
steps:
134+
- uses: actions/checkout@v5
135+
136+
- name: Grant permissions to APT cache directory # allows restore
137+
run: sudo chown -R $USER:$USER /var/cache/apt/archives
138+
139+
- name: Cache APT packages
140+
id: apt-cache
141+
uses: actions/cache@v4
142+
with:
143+
path: /var/cache/apt/archives
144+
key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }}
145+
restore-keys: |
146+
apt-packages-${{ runner.os }}-
147+
148+
- name: Install System Dependencies
149+
run: |
150+
sudo apt-get -qq update
151+
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
152+
153+
- name: Set TESSDATA_PREFIX
154+
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
155+
156+
- name: Install uv and set the python version
157+
uses: astral-sh/setup-uv@v6
158+
with:
159+
python-version: ${{ matrix.python-version }}
160+
161+
- name: Install Python Dependencies
162+
run: uv sync --frozen --all-extras
163+
164+
- name: Cache Models
165+
uses: actions/cache@v4
166+
with:
167+
path: |
168+
~/.cache/huggingface
169+
~/.cache/modelscope
170+
~/.EasyOCR/
171+
key: models-cache
172+
173+
- name: Pre-download Models
174+
run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])"
175+
176+
- name: Run tests for GROUP2
177+
run: |
178+
echo "--- Running tests ---"
179+
GROUP2=$(echo "$PYTEST_ML" | tr '\n' ' ')
180+
echo "Running tests for GROUP2"
181+
DESELECT_OPT=""
182+
if [ -n "$PYTEST_TO_SKIP" ]; then
183+
DESELECT_OPT="--deselect $PYTEST_TO_SKIP"
66184
fi
185+
echo "Running tests for GROUP2"
186+
uv run pytest -v --durations=0 --cov=docling --cov-report=xml --cov-context=test $GROUP2 $DESELECT_OPT
187+
188+
- name: Upload coverage to Codecov
189+
if: inputs.push_coverage
190+
uses: codecov/codecov-action@v5
191+
with:
192+
token: ${{ secrets.CODECOV_TOKEN }}
193+
files: ./coverage.xml
194+
flags: run-tests-2
195+
196+
- name: Grant permissions to APT cache directory # allows backup
197+
run: sudo chown -R $USER:$USER /var/cache/apt/archives
198+
199+
run-examples:
200+
runs-on: ubuntu-latest
201+
strategy:
202+
fail-fast: false
203+
matrix:
204+
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
205+
steps:
206+
- uses: actions/checkout@v5
207+
208+
- name: Grant permissions to APT cache directory # allows restore
209+
run: sudo chown -R $USER:$USER /var/cache/apt/archives
210+
211+
- name: Cache APT packages
212+
id: apt-cache
213+
uses: actions/cache@v4
214+
with:
215+
path: /var/cache/apt/archives
216+
key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }}
217+
restore-keys: |
218+
apt-packages-${{ runner.os }}-
219+
220+
- name: Install System Dependencies
221+
run: |
222+
sudo apt-get -qq update
223+
sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
224+
225+
- name: Set TESSDATA_PREFIX
226+
run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
227+
228+
- name: Install uv and set the python version
229+
uses: astral-sh/setup-uv@v6
230+
with:
231+
python-version: ${{ matrix.python-version }}
232+
233+
- name: Install Python Dependencies
234+
run: uv sync --frozen --all-extras
67235

68-
echo "Running example $file"
69-
uv run --no-sync python "$file" || exit 1
70-
done
236+
- name: Cache Models
237+
uses: actions/cache@v4
238+
with:
239+
path: |
240+
~/.cache/huggingface
241+
~/.cache/modelscope
242+
~/.EasyOCR/
243+
key: models-cache
244+
245+
- name: Pre-download Models
246+
run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])"
247+
248+
- name: Run examples
249+
run: |
250+
echo "--- Creating output directory ---"
251+
mkdir -p scratch
252+
253+
echo "--- Running examples ---"
254+
255+
summary_file="runtime_summary.log"
256+
echo "--- Example Runtimes ---" > "$summary_file"
257+
258+
for file in docs/examples/*.py; do
259+
if [[ "$(basename "$file")" =~ ${EXAMPLES_TO_SKIP} ]]; then
260+
echo "Skipping example: $(basename "$file")"
261+
else
262+
echo "--- Running example $(basename "$file") ---"
263+
264+
start_time=$SECONDS
265+
266+
uv run --no-sync python "$file" || exit 1
267+
duration=$((SECONDS - start_time))
268+
echo "Finished in ${duration}s."
269+
270+
echo "$(basename "$file"): ${duration}s" >> "$summary_file"
271+
fi
272+
done
273+
274+
echo
275+
echo "==================================="
276+
echo " Final Runtime Summary "
277+
echo "==================================="
278+
cat "$summary_file"
279+
echo "==================================="
280+
281+
- name: Grant permissions to APT cache directory # allows backup
282+
run: sudo chown -R $USER:$USER /var/cache/apt/archives
71283

72284
build-package:
73285
runs-on: ubuntu-latest
74286
strategy:
75287
matrix:
76288
python-version: ['3.12']
77289
steps:
78-
- uses: actions/checkout@v4
290+
- uses: actions/checkout@v5
291+
79292
- name: Install uv and set the python version
80-
uses: astral-sh/setup-uv@v5
293+
uses: astral-sh/setup-uv@v6
81294
with:
82295
python-version: ${{ matrix.python-version }}
83296
enable-cache: true
297+
84298
- name: Install dependencies
85299
run: uv sync --all-extras
300+
86301
- name: Build package
87302
run: uv build
303+
88304
- name: Check content of wheel
89305
run: unzip -l dist/*.whl
306+
90307
- name: Store the distribution packages
91308
uses: actions/upload-artifact@v4
92309
with:
@@ -106,12 +323,17 @@ jobs:
106323
with:
107324
name: python-package-distributions
108325
path: dist/
326+
109327
- name: Install uv and set the python version
110-
uses: astral-sh/setup-uv@v5
328+
uses: astral-sh/setup-uv@v6
111329
with:
112330
python-version: ${{ matrix.python-version }}
113-
enable-cache: true
331+
activate-environment: true
332+
enable-cache: false
333+
114334
- name: Install package
115-
run: uv pip install dist/*.whl
335+
run: |
336+
uv pip install dist/*.whl
337+
116338
- name: Run docling
117-
run: docling --help
339+
run: uv run docling --help

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,6 @@ repos:
2222
language: system
2323
files: '\.py$'
2424
- repo: https://github.com/astral-sh/uv-pre-commit
25-
rev: 0.7.8
25+
rev: 0.8.3
2626
hooks:
2727
- id: uv-lock

0 commit comments

Comments
 (0)