Skip to content

Commit 1fccb29

Browse files
feat!: Massive quality improvements to v2 parser and new sanitize_cells API (#73)
Signed-off-by: Peter Staar <[email protected]> Signed-off-by: Christoph Auer <[email protected]> Co-authored-by: Christoph Auer <[email protected]>
1 parent 7706471 commit 1fccb29

File tree

325 files changed

+4695527
-5402417
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

325 files changed

+4695527
-5402417
lines changed

.github/scripts/build_rhel.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ sudo -E XDG_RUNTIME_DIR= podman build --progress=plain \
4040
--no-deps --no-build-isolation -w /dist/ \
4141
/src/docling_parse*.tar.gz \
4242
&& pip3.11 install /dist/docling_parse*.whl \
43-
&& python3.11 -c 'from docling_parse.docling_parse import pdf_parser_v1, pdf_parser_v2'
43+
&& python3.11 -c 'from docling_parse.pdf_parsers import pdf_parser_v1, pdf_parser_v2'
4444
4545
COPY ./tests /src/tests
4646

.pre-commit-config.yaml

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,42 @@ repos:
44
hooks:
55
- id: system
66
name: Black
7-
entry: poetry run black docling_parse
7+
entry: poetry run black docling_parse tests
88
pass_filenames: false
99
language: system
1010
files: '\.py$'
1111
- repo: local
1212
hooks:
1313
- id: system
1414
name: isort
15-
entry: poetry run isort docling_parse
15+
entry: poetry run isort docling_parse tests
1616
pass_filenames: false
1717
language: system
1818
files: '\.py$'
19+
- repo: local
20+
hooks:
21+
- id: autoflake
22+
name: autoflake
23+
entry: poetry run autoflake docling_parse
24+
pass_filenames: false
25+
language: system
26+
files: '\.py$'
27+
- repo: local
28+
hooks:
29+
- id: mypy
30+
name: MyPy
31+
entry: poetry run mypy docling_parse tests
32+
pass_filenames: false
33+
language: system
34+
files: '\.py$'
35+
# - repo: local
36+
# hooks:
37+
# - id: pytest
38+
# name: Pytest
39+
# entry: poetry run pytest tests/
40+
# pass_filenames: false
41+
# language: system
42+
# files: '\.py$'
1943
- repo: local
2044
hooks:
2145
- id: system

CMakeLists.txt

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -192,18 +192,24 @@ target_link_libraries(parse_v2 ${DEPENDENCIES} ${LIB_LINK})
192192
# https://pybind11.readthedocs.io/en/stable/compiling.html#configuration-variables
193193
find_package(pybind11 CONFIG REQUIRED)
194194

195-
pybind11_add_module(docling_parse "${TOPLEVEL_PREFIX_PATH}/app/pybind_parse.cpp")
195+
#pybind11_add_module(docling_parse "${TOPLEVEL_PREFIX_PATH}/app/pybind_parse.cpp")
196+
pybind11_add_module(pdf_parsers "${TOPLEVEL_PREFIX_PATH}/app/pybind_parse.cpp")
196197

197-
add_dependencies(docling_parse parse_v1 parse_v2)
198+
#add_dependencies(docling_parse parse_v1 parse_v2)
199+
add_dependencies(pdf_parsers parse_v1 parse_v2)
198200

199-
target_include_directories(docling_parse INTERFACE ${DEPENDENCIES})
201+
#target_include_directories(docling_parse INTERFACE ${DEPENDENCIES})
202+
target_include_directories(pdf_parsers INTERFACE ${DEPENDENCIES})
200203

201-
target_compile_definitions(docling_parse PRIVATE VERSION_INFO=${CMAKE_PROJECT_VERSION})
204+
#target_compile_definitions(docling_parse PRIVATE VERSION_INFO=${CMAKE_PROJECT_VERSION})
205+
target_compile_definitions(pdf_parsers PRIVATE VERSION_INFO=${CMAKE_PROJECT_VERSION})
202206

203-
target_link_libraries(docling_parse PRIVATE parse_v1 parse_v2)
207+
#target_link_libraries(docling_parse PRIVATE parse_v1 parse_v2)
208+
target_link_libraries(pdf_parsers PRIVATE parse_v1 parse_v2)
204209

205210
# *****************
206211
# *** Install ***
207212
# *****************
208213

209-
install(TARGETS docling_parse DESTINATION "${TOPLEVEL_PREFIX_PATH}/docling_parse")
214+
#install(TARGETS docling_parse DESTINATION "${TOPLEVEL_PREFIX_PATH}/docling_parse")
215+
install(TARGETS pdf_parsers DESTINATION "${TOPLEVEL_PREFIX_PATH}/docling_parse")

README.md

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,7 @@ Convert a PDF (look in the [visualise.py](docling_parse/visualise.py) for a more
4848
from docling_parse.pdf_parsers import pdf_parser_v2
4949

5050
# Do this only once to load fonts (avoid initialising it many times)
51-
parser = pdf_parser_v2()
52-
53-
# parser.set_loglevel(1) # 1=error, 2=warning, 3=success, 4=info
51+
parser = pdf_parser_v2("error") # info, warning, error, fatal
5452

5553
doc_file = "my-doc.pdf" # filename
5654
doc_key = f"key={doc_file}" # unique document key (eg hash, UUID, etc)
@@ -167,7 +165,7 @@ If you don't have an input file, then a template input file will be printed on th
167165
To build the package, simply run (make sure [poetry](https://python-poetry.org/) is [installed](https://python-poetry.org/docs/#installing-with-the-official-installer)):
168166

169167
```
170-
poetry build
168+
poetry install
171169
```
172170

173171
To test the package, run:

0 commit comments

Comments
 (0)