Skip to content

Commit 79888d0

Browse files
feat: Extend the tests and demo to first download the model files from HF. Add pytest to the GitHub workflow (#30)
* feat(devops): Introduce pytest in GitHub checks.yml workflow. Add PR template. Code styling for pre-commit. Remove devtools. * feat(poetry): Introduce huggingface_hub dependency in `toml`. Update `lock` file. * feat: Extend the tests and demo to first download the model files from HF. This removes the need for a manual download. Update README. --------- Signed-off-by: Nikos Livathinos <[email protected]>
1 parent be092c5 commit 79888d0

File tree

11 files changed

+107
-125
lines changed

11 files changed

+107
-125
lines changed

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<!-- Thank you for contributing to docling-ibm-models! -->
2+
3+
<!-- STEPS TO FOLLOW:
4+
1. Add a description of the changes (frequently the same as the commit description)
5+
2. Enter the issue number next to "Resolves #" below (if there is no tracking issue resolved, **remove that section**)
6+
3. Follow the steps in the checklist below, starting with the **Commit Message Formatting**.
7+
-->
8+
9+
<!-- Uncomment this section with the issue number if an issue is being resolved
10+
**Issue resolved by this Pull Request:**
11+
Resolves #
12+
--->
13+
14+
**Checklist:**
15+
16+
- [ ] **Commit Message Formatting**: Commit titles and messages follow guidelines in the
17+
[conventional commits](https://www.conventionalcommits.org/en/v1.0.0/#summary).
18+
- [ ] Documentation has been updated, if necessary.
19+
- [ ] Examples have been added, if necessary.
20+
- [ ] Tests have been added, if necessary.
21+

.github/workflows/checks.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,7 @@ jobs:
1414
python-version: ${{ matrix.python-version }}
1515
- name: Run styling check
1616
run: poetry run pre-commit run --all-files
17+
- name: Install with poetry
18+
run: poetry install --all-extras
19+
- name: Testing
20+
run: poetry run pytest -v tests

.pre-commit-config.yaml

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,19 @@ fail_fast: true
22
repos:
33
- repo: local
44
hooks:
5-
- id: system
5+
- id: black
66
name: Black
77
entry: poetry run black docling_ibm_models
88
pass_filenames: false
99
language: system
1010
files: '\.py$'
11-
- repo: local
12-
hooks:
13-
- id: system
11+
- id: isort
1412
name: isort
1513
entry: poetry run isort docling_ibm_models
1614
pass_filenames: false
1715
language: system
1816
files: '\.py$'
19-
- repo: local
20-
hooks:
21-
- id: system
17+
- id: poetry
2218
name: Poetry check
2319
entry: poetry lock --check
2420
pass_filenames: false
@@ -40,4 +36,4 @@ repos:
4036
# entry: poetry run mypy docling_ibm_models
4137
# pass_filenames: false
4238
# language: system
43-
# files: '\.py$'
39+
# files: '\.py$'

README.md

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ Below we list datasets used with their description, source, and ***"TableFormer
7676

7777
## Configuration file
7878

79-
Example configuration can be seen inside test `tests/test_tf_predictor.py`
79+
Example configuration can be found inside test `tests/test_tf_predictor.py`
8080
These are the main sections of the configuration file:
8181

8282
- `dataset`: The directory for prepared data and the parameters used during the data loading.
@@ -94,16 +94,13 @@ You can download the model weights and config files from the links:
9494
- [TableFormer Checkpoint](https://huggingface.co/ds4sd/docling-models/tree/main/model_artifacts/tableformer)
9595
- [beehive_v0.0.5](https://huggingface.co/ds4sd/docling-models/tree/main/model_artifacts/layout/beehive_v0.0.5)
9696

97-
Place the downloaded files into `tests/test_data/model_artifacts/` directory.
98-
9997

10098
## Inference Tests
10199

102-
This contains unit tests for Docling models.
100+
You can run the inference tests for the models with:
103101

104-
First download the model weights (see above), then run:
105102
```
106-
./devtools/check_code.sh
103+
python -m pytest tests/
107104
```
108105

109106
This will also generate prediction and matching visualizations that can be found here:

demo/demo_layout_predictor.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,10 @@
1111

1212
import numpy as np
1313
from PIL import Image, ImageDraw
14+
from huggingface_hub import snapshot_download
1415

1516
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
1617

17-
ARTIFACT_PATH = "tests/test_data/model_artifacts"
18-
1918

2019
def demo(
2120
logger: logging.Logger,
@@ -96,8 +95,12 @@ def main(args):
9695
# Ensure the viz dir
9796
Path(viz_dir).mkdir(parents=True, exist_ok=True)
9897

98+
# Download models from HF
99+
download_path = snapshot_download(repo_id="ds4sd/docling-models")
100+
artifact_path = os.path.join(download_path, "model_artifacts/layout/beehive_v0.0.5")
101+
99102
# Test the LayoutPredictor
100-
demo(logger, ARTIFACT_PATH, num_threads, img_dir, viz_dir)
103+
demo(logger, artifact_path, num_threads, img_dir, viz_dir)
101104

102105

103106
if __name__ == "__main__":

devtools/check_code.sh

Lines changed: 0 additions & 96 deletions
This file was deleted.

poetry.lock

Lines changed: 35 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ Pillow = "^10.0.0"
3838
tqdm = "^4.64.0"
3939
mean_average_precision = "^2021.4.26.0"
4040
opencv-python-headless = { version = "^4.9.0.80" }
41+
huggingface_hub = ">=0.23,<1"
4142

4243
[tool.poetry.dev-dependencies]
4344
black = {extras = ["jupyter"], version = "^24.4.2"}

tests/test_data/model_artifacts/put_model_check_here.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/test_layout_predictor.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import pytest
99
from PIL import Image
1010

11+
from huggingface_hub import snapshot_download
12+
1113
import docling_ibm_models.layoutmodel.layout_predictor as lp
1214
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
1315

@@ -17,26 +19,20 @@ def init() -> dict:
1719
r"""
1820
Initialize the testing environment
1921
"""
22+
# NOTE: The keys "artifact_path", "info1.onnx_file" and "info2.onnx_file" are intentionally omitted here; they are added below, once the model files have been downloaded from HF.
2023
init = {
21-
"artifact_path": "tests/test_data/model_artifacts/",
2224
"num_threads": 1,
2325
"test_imgs": [
2426
"tests/test_data/samples/ADS.2007.page_123.png",
2527
],
2628
"info1": {
27-
"onnx_file": os.path.join(
28-
"tests/test_data/model_artifacts/", lp.MODEL_CHECKPOINT_FN
29-
),
3029
"intra_op_num_threads": 2,
3130
"providers": ["CPUExecutionProvider"],
3231
"use_cpu_only": True,
3332
"image_size": 640,
3433
"threshold": 0.6,
3534
},
3635
"info2": {
37-
"onnx_file": os.path.join(
38-
"tests/test_data/model_artifacts/", lp.MODEL_CHECKPOINT_FN
39-
),
4036
"intra_op_num_threads": 1,
4137
"providers": ["CPUExecutionProvider"],
4238
"use_cpu_only": True,
@@ -45,6 +41,16 @@ def init() -> dict:
4541
},
4642
"pred_bboxes": 9,
4743
}
44+
45+
# Download models from HF
46+
download_path = snapshot_download(repo_id="ds4sd/docling-models")
47+
artifact_path = os.path.join(download_path, "model_artifacts/layout/beehive_v0.0.5")
48+
49+
# Add the missing config keys
50+
init["artifact_path"] = artifact_path
51+
init["info1"]["onnx_file"] = os.path.join(artifact_path, lp.MODEL_CHECKPOINT_FN)
52+
init["info2"]["onnx_file"] = os.path.join(artifact_path, lp.MODEL_CHECKPOINT_FN)
53+
4854
return init
4955

5056

0 commit comments

Comments
 (0)