Skip to content

Commit 29a94d3

Browse files
authored
[Paddle-pipelines] update pdf (#7737)
* update pdf * update pdf * update pdf * update pdf * update pdf * update pdf
1 parent 57504e7 commit 29a94d3

File tree

3 files changed

+49
-15
lines changed

3 files changed

+49
-15
lines changed

pipelines/pipelines/nodes/file_converter/pdf.py

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,15 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import functools
1617
import logging
18+
import multiprocessing
1719
import os
1820
import tempfile
1921
from pathlib import Path
2022
from typing import Any, Dict, List, Optional
2123

22-
import pdfplumber
24+
import pypdf
2325

2426
try:
2527
from pdf2image import convert_from_path
@@ -33,6 +35,27 @@
3335
logger = logging.getLogger(__name__)
3436

3537

38+
def extract_pages(page_list, file_path):
    """
    Extract the text of a contiguous range of pages from a pdf file.

    :param page_list: sequence whose first two items are the start index (inclusive) and the
                      end index (exclusive) of the pages to read.
    :param file_path: path of the pdf file
    :return: list holding the result of ``extract_text()`` for each page in the range, in page order.
    """
    first, last = page_list[0], page_list[1]
    reader = pypdf.PdfReader(file_path)
    return [pdf_page.extract_text() for pdf_page in reader.pages[first:last]]
47+
48+
49+
def run_process(pages, file_path, process_num=2):
    """
    Extract text from several page ranges of one pdf file using a pool of worker processes.

    :param pages: iterable of ``(start, end)`` page ranges; each range is handed to ``extract_pages``.
    :param file_path: path of the pdf file
    :param process_num: requested number of worker processes. The value actually used is clamped to
                        the range ``1 .. min(cpu count, number of page ranges)``.
    :return: list with one entry per page range (in the order of ``pages``), each entry being the
             list of page texts returned by ``extract_pages`` for that range.
    """
    pages = list(pages)
    if not pages:
        # Nothing to extract: avoid spawning worker processes at all.
        return []

    # os.cpu_count() may return None when the count cannot be determined.
    cpu_total = os.cpu_count() or 1
    # More workers than tasks would only sit idle; fewer than one is rejected by Pool.
    worker_num = max(1, min(cpu_total, process_num, len(pages)))

    extract_pages_c = functools.partial(extract_pages, file_path=file_path)
    # The context manager tears the pool down even if a worker raises.
    with multiprocessing.Pool(worker_num) as pool:
        return pool.map(extract_pages_c, pages)
57+
58+
3659
class PDFToTextConverter(BaseConverter):
3760
def __init__(
3861
self,
@@ -61,16 +84,18 @@ def __init__(
6184
def convert(
6285
self,
6386
file_path: Path,
87+
process_num: int = 20,
6488
meta: Optional[Dict[str, str]] = None,
6589
remove_numeric_tables: Optional[bool] = None,
6690
valid_languages: Optional[List[str]] = None,
6791
language: Optional[str] = "en",
6892
**kwargs: Any,
6993
) -> List[Dict[str, Any]]:
7094
"""
71-
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
95+
Extract text from a .pdf file using the pypdf library (https://pybrary.net/pyPdf/)
7296
7397
:param file_path: Path to the .pdf file you want to convert
98+
:param process_num: Number of processes
7499
:param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
75100
Can be any custom keys and values.
76101
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
@@ -84,8 +109,7 @@ def convert(
84109
not one of the valid languages, then it might likely be encoding error resulting
85110
in garbled text.
86111
"""
87-
88-
pages = self._read_pdf(file_path, layout=False)
112+
pages = self._read_pdf(file_path, layout=False, process_num=process_num)
89113
if remove_numeric_tables is None:
90114
remove_numeric_tables = self.remove_numeric_tables
91115
if valid_languages is None:
@@ -136,20 +160,30 @@ def convert(
136160
document = {"content": text, "content_type": "text", "meta": meta}
137161
return [document]
138162

139-
def _read_pdf(self, file_path: Path, layout: bool) -> List[str]:
163+
def _read_pdf(self, file_path: Path, layout: bool, process_num: int) -> List[str]:
    """
    Extract pages from the pdf file at file_path.

    :param file_path: path of the pdf file
    :param layout: whether to retain the original physical layout for a page. If disabled, PDF pages are read in
                   the content stream order. NOTE: this implementation does not read the flag.
    :param process_num: Number of processes. Values above the CPU count are reduced to it (with a
                        warning); the value is also kept between 1 and the number of pages.
    :return: list with the extracted text of every page, in page order. Empty for a pdf without pages.
    """
    # os.cpu_count() may return None when the count cannot be determined.
    cpu_total = os.cpu_count() or 1
    if process_num > cpu_total:
        logger.warning("The number of processes cannot exceed the number of CPUs")
        process_num = cpu_total
    process_num = max(1, process_num)

    page_length = len(pypdf.PdfReader(file_path).pages)
    if page_length == 0:
        return []

    # With more processes than pages a floor division would yield a chunk size of 0,
    # which is an invalid step for range(); cap the process count and round the chunk size up.
    process_num = min(process_num, page_length)
    split_len = -(-page_length // process_num)  # ceiling division, always >= 1 here
    page_combination = [
        (start, min(start + split_len, page_length)) for start in range(0, page_length, split_len)
    ]

    page_text = run_process(page_combination, file_path, process_num)
    # run_process returns one list of page texts per chunk; flatten them, keeping page order.
    return [text for chunk in page_text for text in chunk]
153187

154188

155189
class PDFToTextOCRConverter(BaseConverter):

pipelines/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ sqlalchemy_utils
1111
langdetect
1212
python-docx
1313
nltk
14-
pdfplumber
1514
faiss-cpu>=1.7.2
1615
opencv-python>=4.4
1716
opencv-contrib-python-headless
@@ -26,4 +25,5 @@ spacy
2625
tritonclient[all]
2726
typing_extensions==4.5.0
2827
aistudio_sdk
29-
markdown
28+
markdown
29+
pypdf

pipelines/tests/nodes/file_converter/test_pdf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ def test_conversion(self):
2525

2626
expected_result = [
2727
{
28-
"content": "A Simple PDF File\nThis is a small demonstration .pdf file -\njust for use in the Virtual Mechanics tutorials. More text. And more\ntext. And more text. And more text. And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. And more text. Boring, zzzzz. And more text. And more text. And\nmore text. And more text. And more text. And more text. And more text.\nAnd more text. And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. And more text. And more text. Even more. Continued on page 2 ...\x0cSimple PDF File 2\n...continued from page 1. Yet more text. And more text. And more text.\nAnd more text. And more text. And more text. And more text. And more\ntext. Oh, how boring typing this stuff. But not as boring as watching\npaint dry. And more text. And more text. And more text. And more text.\nBoring. More, a little more text. The end, and just as well.",
28+
"content": " A Simple PDF File \n This is a small demonstration .pdf file - \n just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...\x0c Simple PDF File 2 \n ...continued from page 1. Yet more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. Oh, how boring typing this stuff. But not as boring as watching \n paint dry. And more text. And more text. And more text. And more text. \n Boring. More, a little more text. The end, and just as well. ",
2929
"content_type": "text",
3030
"meta": None,
3131
}
3232
]
33-
result = converter.convert(file_path)
33+
result = converter.convert(file_path, process_num=1)
3434
self.assertEqual(expected_result, result)

0 commit comments

Comments
 (0)