Skip to content

Commit e81d21b

Browse files
committed
parse page layout recursively
2 parents bc1cb6b + fbf6034 commit e81d21b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

69 files changed

+50204
-43188
lines changed

.github/workflows/doc.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Build the Sphinx HTML documentation and publish it to GitHub Pages
# whenever a release tag such as v3.2.1 is pushed.
name: pdf2docx-doc

on:
  push:
    tags:
      - 'v[0-9]+.[0-9]+.[0-9]+'

jobs:
  publish_doc:
    # The ubuntu-18.04 hosted runner label has been retired; use a maintained image.
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Python 3.x
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Display Python version
        run: python -c "import sys; print(sys.version)"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install sphinx sphinx_rtd_theme
          pip install -r requirements.txt
          # Editable install through pip; unlike `python setup.py develop`
          # it does not need setuptools to be preinstalled.
          python -m pip install -e .

      # Record the doc version from the tag, e.g. 3.2.1 extracted from
      # 'refs/tags/v3.2.1'; doc/conf.py reads it from version.txt.
      - name: Create html doc
        run: |
          echo "${GITHUB_REF#refs/tags/v}" > version.txt
          make doc

      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./build/html

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
*.txt
55
*.docx
66
layout.json
7+
.vscode/
78

89
# pdf testing files
910
*.pdf
@@ -16,4 +17,5 @@ feature-*/
1617
# building dir
1718
build/
1819
dist/
19-
*egg-info/
20+
*egg-info/
21+
pdf2docx*.rst

MANIFEST.in

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
include *.md
22
include LICENSE*
33
include requirements.txt
4-
recursive-include test *.py *.pdf
4+
prune test
5+
include test/*.py
6+
include test/samples/*.pdf

Makefile

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Project makefile

# working directories and files
#
# $(CURDIR) is make's own absolute working directory, so no shell has to be
# forked at parse time just to run `pwd`.
TOPDIR    :=$(CURDIR)
SRC       :=$(TOPDIR)/pdf2docx
BUILD     :=$(TOPDIR)/build
DOCSRC    :=$(TOPDIR)/doc
TEST      :=$(TOPDIR)/test
CLEANDIRS :=.pytest_cache pdf2docx.egg-info dist

# The doc build uses the sphinx_rtd_theme package:
#   pip install sphinx_rtd_theme

.PHONY: src doc test clean
15+
16+
# Build the source distributions (gztar and zip), then the wheel; the wheel
# is only attempted when the sdist step succeeds.
src:
	@python setup.py sdist --formats=gztar,zip \
	  && python setup.py bdist_wheel
19+
20+
# Build the HTML documentation by delegating to doc/Makefile. A tree without
# that file is skipped silently. $(MAKE) (rather than a literal `make`) keeps
# flags such as -n and -j working across the recursive call.
doc:
	@if [ -f "$(DOCSRC)/Makefile" ] ; then \
		( cd "$(DOCSRC)" && $(MAKE) html MODULEDIR="$(SRC)" BUILDDIR="$(BUILD)" ) || exit 1 ; \
	fi
24+
25+
# Run test/test.py verbosely, measuring coverage of the package and writing
# the coverage report as XML.
test:
	@pytest -v "$(TEST)/test.py" \
		--cov="$(SRC)" \
		--cov-report=xml
27+
28+
# Remove everything the other targets generate: the doc output (via
# doc/Makefile's own clean target), the directories listed in CLEANDIRS and
# the build directory.
clean:
	@if [ -e "$(DOCSRC)/Makefile" ] ; then \
		( cd "$(DOCSRC)" && $(MAKE) $@ BUILDDIR="$(BUILD)" ) || exit 1 ; \
	fi
	@for p in $(CLEANDIRS) ; do \
		if [ -d "$(TOPDIR)/$$p" ]; then rm -rf "$(TOPDIR)/$$p" ; fi ; \
	done
	@if [ -d "$(BUILD)" ]; then rm -rf "$(BUILD)" ; fi

README.md

Lines changed: 8 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -44,153 +44,14 @@
4444
- no word transformation, e.g. rotation
4545

4646

47-
## Installation
48-
49-
### From Pypi
50-
51-
```
52-
$ pip install pdf2docx
53-
```
54-
55-
### From source code
56-
57-
Clone or download this project, and navigate to the root directory:
58-
59-
```
60-
$ python setup.py install
61-
```
62-
63-
Or install it in developing mode:
64-
65-
```
66-
$ python setup.py develop
67-
```
68-
69-
### Uninstall
70-
71-
```
72-
$ pip uninstall pdf2docx
73-
```
74-
75-
## Usage
76-
77-
`pdf2docx` can be used as either a CLI or a library.
78-
79-
### Command Line Interface
80-
81-
```
82-
$ pdf2docx --help
83-
84-
NAME
85-
pdf2docx - Command line interface for pdf2docx.
86-
87-
SYNOPSIS
88-
pdf2docx COMMAND | -
89-
90-
DESCRIPTION
91-
Command line interface for pdf2docx.
92-
93-
COMMANDS
94-
COMMAND is one of the following:
95-
96-
convert
97-
Convert pdf file to docx file.
98-
99-
debug
100-
Convert one PDF page and plot layout information for debugging.
101-
102-
table
103-
Extract table content from pdf pages.
104-
```
105-
106-
- By range of pages
107-
108-
Specify the page range with `--start` (from the first page if omitted) and `--end` (to the last page if omitted). Note that the page index is zero-based by default; this can be turned off with `--zero_based_index=False`, in which case the first page has index 1.
109-
110-
111-
```bash
112-
$ pdf2docx convert test.pdf test.docx # all pages
113-
114-
$ pdf2docx convert test.pdf test.docx --start=1 # from the second page to the end
115-
116-
$ pdf2docx convert test.pdf test.docx --end=3 # from the first page to the third (index=2)
117-
118-
$ pdf2docx convert test.pdf test.docx --start=1 --end=3 # the second and third pages
119-
120-
$ pdf2docx convert test.pdf test.docx --start=1 --end=3 --zero_based_index=False # the first and second pages
121-
122-
```
123-
124-
- By page numbers
125-
126-
```bash
127-
$ pdf2docx convert test.pdf test.docx --pages=0,2,4 # the first, third and 5th pages
128-
```
129-
130-
- Multi-Processing
131-
132-
```bash
133-
$ pdf2docx convert test.pdf test.docx --multi_processing=True # default count of CPU
134-
135-
$ pdf2docx convert test.pdf test.docx --multi_processing=True --cpu_count=4
136-
```
137-
138-
139-
### Python Library
140-
141-
We can use either the `Converter` class or a wrapped method `parse()`.
142-
143-
- `Converter`
144-
145-
```python
146-
from pdf2docx import Converter
147-
148-
pdf_file = '/path/to/sample.pdf'
149-
docx_file = 'path/to/sample.docx'
150-
151-
# convert pdf to docx
152-
cv = Converter(pdf_file)
153-
cv.convert(docx_file, start=0, end=None)
154-
cv.close()
155-
```
156-
157-
158-
- Wrapped method `parse()`
159-
160-
```python
161-
from pdf2docx import parse
162-
163-
pdf_file = '/path/to/sample.pdf'
164-
docx_file = 'path/to/sample.docx'
165-
166-
# convert pdf to docx
167-
parse(pdf_file, docx_file, start=0, end=None)
168-
```
169-
170-
Or just to extract tables,
171-
172-
```python
173-
from pdf2docx import Converter
174-
175-
pdf_file = '/path/to/sample.pdf'
176-
177-
cv = Converter(pdf_file)
178-
tables = cv.extract_tables(start=0, end=1)
179-
cv.close()
180-
181-
for table in tables:
182-
print(table)
183-
184-
# outputs
185-
...
186-
[['Input ', None, None, None, None, None],
187-
['Description A ', 'mm ', '30.34 ', '35.30 ', '19.30 ', '80.21 '],
188-
['Description B ', '1.00 ', '5.95 ', '6.16 ', '16.48 ', '48.81 '],
189-
['Description C ', '1.00 ', '0.98 ', '0.94 ', '1.03 ', '0.32 '],
190-
['Description D ', 'kg ', '0.84 ', '0.53 ', '0.52 ', '0.33 '],
191-
['Description E ', '1.00 ', '0.15 ', None, None, None],
192-
['Description F ', '1.00 ', '0.86 ', '0.37 ', '0.78 ', '0.01 ']]
193-
```
47+
## Documentation
48+
49+
- [Installation](https://dothinking.github.io/pdf2docx/installation.html)
50+
- [Quickstart](https://dothinking.github.io/pdf2docx/quickstart.html)
51+
- [Convert PDF](https://dothinking.github.io/pdf2docx/quickstart.convert.html)
52+
- [Extract table content](https://dothinking.github.io/pdf2docx/quickstart.table.html)
53+
- [Command Line Interface](https://dothinking.github.io/pdf2docx/quickstart.cli.html)
54+
- [API Documentation](https://dothinking.github.io/pdf2docx/modules.html)
19455

19556
## Sample
19657

doc/Makefile

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Minimal makefile for Sphinx documentation
#

# MODULEDIR and BUILDDIR are passed in by the top-level makefile.
SOURCEDIR := .
TARGETDIR := doctrees html

.PHONY: html clean
9+
10+
# Generate one API stub per module with sphinx-apidoc, then build the HTML
# pages; the build only runs when stub generation succeeds.
html: Makefile
	@sphinx-apidoc --separate -o "$(SOURCEDIR)" "$(MODULEDIR)" \
	  && sphinx-build -M html "$(SOURCEDIR)" "$(BUILDDIR)"
13+
14+
# Remove the Sphinx output directories under BUILDDIR and the generated
# pdf2docx*.rst API stubs.
clean:
	@# With BUILDDIR unset the paths below would collapse to /doctrees and
	@# /html at the filesystem root, so skip the removal in that case.
	@if [ -n "$(BUILDDIR)" ] ; then \
		for p in $(TARGETDIR) ; do \
			if [ -d "$(BUILDDIR)/$$p" ]; then rm -rf "$(BUILDDIR)/$$p" ; fi ; \
		done ; \
	else \
		echo "BUILDDIR is not set; skipping removal of: $(TARGETDIR)" >&2 ; \
	fi
	@# -f: do not fail when modules.rst exists but no stub matches the glob.
	@if [ -e modules.rst ]; then rm -f pdf2docx*.rst ; fi

doc/conf.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Configuration file for the Sphinx documentation builder.
2+
#
3+
# This file only contains a selection of the most common options. For a full
4+
# list see the documentation:
5+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
6+
7+
# -- Path setup --------------------------------------------------------------
8+
9+
# If extensions (or modules to document with autodoc) are in another directory,
10+
# add these directories to sys.path here. If the directory is relative to the
11+
# documentation root, use os.path.abspath to make it absolute, like shown here.
12+
#
13+
import os
14+
import sys
15+
# autodoc imports the code as the ``pdf2docx`` package (the stubs generated by
# sphinx-apidoc are named pdf2docx*.rst), so the directory that CONTAINS the
# package has to be on sys.path, not the package directory itself.
sys.path.insert(0, os.path.abspath(".."))
16+
17+
18+
# -- Project information -----------------------------------------------------
19+
20+
project = 'pdf2docx'
21+
copyright = '2021, dothinking'
22+
author = 'dothinking'
23+
24+
# The full version, including alpha/beta/rc tags
25+
# read version number from version.txt, otherwise alpha version
26+
# Github CI can create version.txt dynamically.
27+
def get_version(fname):
    """Return the documentation version string stored in ``fname``.

    Args:
        fname (str): Path of a text file whose first line holds the version.

    Returns:
        str: The first line of ``fname`` with surrounding whitespace removed,
        or ``'alpha'`` when ``fname`` is not an existing file or that line is
        blank.
    """
    if not os.path.isfile(fname):
        return 'alpha'

    with open(fname, 'r', encoding='utf-8') as f:
        version = f.readline().strip()

    # An empty version.txt would otherwise give the docs an empty release.
    return version or 'alpha'
35+
release = get_version('../version.txt')
36+
37+
# -- General configuration ---------------------------------------------------
38+
39+
# Add any Sphinx extension module names here, as strings. They can be
40+
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
41+
# ones.
42+
extensions = [
43+
'sphinx.ext.autodoc',
44+
'sphinx.ext.napoleon'
45+
]
46+
47+
# Add any paths that contain templates here, relative to this directory.
48+
# templates_path = ['_templates']
49+
50+
# List of patterns, relative to source directory, that match files and
51+
# directories to ignore when looking for source files.
52+
# This pattern also affects html_static_path and html_extra_path.
53+
exclude_patterns = [
54+
]
55+
56+
57+
# -- Options for HTML output -------------------------------------------------
58+
59+
# The theme to use for HTML and HTML Help pages. See the documentation for
60+
# a list of builtin themes.
61+
#
62+
# html_theme = 'alabaster'
63+
html_theme = 'sphinx_rtd_theme'
64+
65+
66+
# Add any paths that contain custom static files (such as style sheets) here,
67+
# relative to this directory. They are copied after the builtin static files,
68+
# so a file named "default.css" will overwrite the builtin "default.css".
69+
# html_static_path = ['_static']
70+

doc/index.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
Welcome to pdf2docx's documentation!
2+
====================================
3+
4+
``pdf2docx`` is a Python library that parses PDF files with ``PyMuPDF`` and generates docx files with ``python-docx``.
5+
6+
7+
.. toctree::
8+
:maxdepth: 2
9+
:caption: Contents:
10+
11+
installation
12+
quickstart
13+
modules
14+
15+
16+
Indices and tables
17+
==================
18+
19+
* :ref:`genindex`
20+
* :ref:`modindex`
21+
* :ref:`search`

0 commit comments

Comments
 (0)