1+ # Copyright 2021 The Layout Parser team. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
15+ from typing import List , Union , Optional , Dict , Tuple
16+
17+ import pdfplumber
18+ import pandas as pd
19+
20+ from ..elements import Layout
21+ from .basic import load_dataframe
22+
# Default resolution (dots per inch) that load_pdf passes to the page-image
# renderer when the caller does not supply `dpi`.
DEFAULT_PDF_DPI = 72
24+
25+
def extract_words_for_page(
    page: pdfplumber.page.Page,
    x_tolerance=1.5,
    y_tolerance=2,
    keep_blank_chars=False,
    use_text_flow=True,
    horizontal_ltr=True,
    vertical_ttb=True,
    extra_attrs=None,
) -> Layout:
    """Extract the word tokens of one pdfplumber page as a Layout.

    Every keyword argument is forwarded unchanged to
    ``page.extract_words``.

    Args:
        page (pdfplumber.page.Page): The page to extract words from.
        x_tolerance: Forwarded to ``page.extract_words``. Defaults to 1.5.
        y_tolerance: Forwarded to ``page.extract_words``. Defaults to 2.
        keep_blank_chars: Forwarded to ``page.extract_words``.
            Defaults to False.
        use_text_flow: Forwarded to ``page.extract_words``.
            Defaults to True.
        horizontal_ltr: Forwarded to ``page.extract_words``.
            Defaults to True.
        vertical_ttb: Forwarded to ``page.extract_words``.
            Defaults to True.
        extra_attrs: Forwarded to ``page.extract_words``. When None,
            ``["fontname", "size"]`` is used.

    Returns:
        Layout: a layout object representing all extracted pdf tokens on
        this page. An empty Layout is returned when the page yields no
        tokens.
    """
    if extra_attrs is None:
        extra_attrs = ["fontname", "size"]

    tokens = page.extract_words(
        x_tolerance=x_tolerance,
        y_tolerance=y_tolerance,
        keep_blank_chars=keep_blank_chars,
        use_text_flow=use_text_flow,
        horizontal_ltr=horizontal_ltr,
        vertical_ttb=vertical_ttb,
        extra_attrs=extra_attrs,
    )

    # A page without extractable words produces a DataFrame with no columns,
    # so the coordinate column selections below would raise KeyError.
    # Return an empty layout instead of crashing on such pages.
    if len(tokens) == 0:
        return Layout([])

    df = pd.DataFrame(tokens)

    # Keep every token box inside the page bounds.
    df[["x0", "x1"]] = (
        df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")
    )
    df[["top", "bottom"]] = (
        df[["top", "bottom"]].clip(lower=0, upper=int(page.height)).astype("float")
    )

    page_tokens = load_dataframe(
        df.rename(
            columns={
                "x0": "x_1",
                "x1": "x_2",
                "top": "y_1",
                "bottom": "y_2",
                "fontname": "type",  # also loading fontname as "type"
            }
        ),
        block_type="rectangle",
    )

    return page_tokens
77+
78+
def load_pdf(
    filename: str,
    load_images: bool = False,
    x_tolerance: float = 1.5,
    y_tolerance: float = 2,
    keep_blank_chars: bool = False,
    use_text_flow: bool = True,
    horizontal_ltr: bool = True,
    vertical_ttb: bool = True,
    extra_attrs: Optional[List[str]] = None,
    dpi: int = DEFAULT_PDF_DPI,
) -> Union[List[Layout], Tuple[List[Layout], List["Image.Image"]]]:
    """Load all tokens for each page from a PDF file, and save them
    in a list of Layout objects with the original page order.

    Args:
        filename (str): The path to the PDF file.
        load_images (bool, optional):
            Whether to load a screenshot for each page of the PDF file.
            When set to true, the function will return both the layout and
            screenshot image for each page.
            Defaults to False.
        x_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the x_2 of one character and the x_1 of the next is less than
            or equal to x_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 1.5.
        y_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the y_2 of one character and the y_1 of the next is less than
            or equal to y_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 2.
        keep_blank_chars (bool, optional):
            When keep_blank_chars is set to True, blank characters are
            treated as part of a word, not as a space between words. See
            details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to False.
        use_text_flow (bool, optional):
            When use_text_flow is set to True, it will use the PDF's underlying
            flow of characters as a guide for ordering and segmenting the words,
            rather than presorting the characters by x/y position. (This mimics
            how dragging a cursor highlights text in a PDF; as with that, the
            order does not always appear to be logical.) See details in
            `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to True.
        horizontal_ltr (bool, optional):
            When horizontal_ltr is set to True, it means the doc should read
            text from left to right, vice versa.
            Defaults to True.
        vertical_ttb (bool, optional):
            When vertical_ttb is set to True, it means the doc should read
            text from top to bottom, vice versa.
            Defaults to True.
        extra_attrs (Optional[List[str]], optional):
            Passing a list of extra_attrs (e.g., ["fontname", "size"]) will
            restrict each word to characters that share exactly the same
            value for each of those `attributes extracted by pdfplumber
            <https://github.com/jsvine/pdfplumber/blob/develop/README.md#char-properties>`_,
            and the resulting word dicts will indicate those attributes.
            See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to `["fontname", "size"]`.
        dpi (int, optional):
            When loading images of the pdf, you can also specify the resolution
            (or `DPI, dots per inch <https://en.wikipedia.org/wiki/Dots_per_inch>`_)
            for rendering the images. Higher DPI values mean clearer images (also
            larger file sizes).
            Setting dpi will also automatically resize the extracted pdf_layout
            to match the sizes of the images. Therefore, when visualizing the
            pdf_layouts, it can be rendered appropriately.
            Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
            from the pdfplumber PDF parser.

    Returns:
        List[Layout]:
            When `load_images=False`, it will only load the pdf_tokens from
            the PDF file. Each element of the list denotes all the tokens appeared
            on a single page, and the list is ordered the same as the original PDF
            page order.
        Tuple[List[Layout], List["Image.Image"]]:
            When `load_images=True`, besides the `all_page_layout`, it will also
            return a list of page images.

    Examples::
        >>> import layoutparser as lp
        >>> pdf_layout = lp.load_pdf("path/to/pdf")
        >>> pdf_layout[0] # the layout for page 0
        >>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)
        >>> lp.draw_box(pdf_images[0], pdf_layout[0])
    """

    all_page_layout = []

    # Open the PDF in a context manager so the underlying file handle is
    # released as soon as token extraction finishes (or fails).
    with pdfplumber.open(filename) as plumber_pdf_object:
        for page_id, cur_page in enumerate(plumber_pdf_object.pages):
            page_tokens = extract_words_for_page(
                cur_page,
                x_tolerance=x_tolerance,
                y_tolerance=y_tolerance,
                keep_blank_chars=keep_blank_chars,
                use_text_flow=use_text_flow,
                horizontal_ltr=horizontal_ltr,
                vertical_ttb=vertical_ttb,
                extra_attrs=extra_attrs,
            )

            # Adding metadata for the current page
            page_tokens.page_data["width"] = float(cur_page.width)
            page_tokens.page_data["height"] = float(cur_page.height)
            page_tokens.page_data["index"] = page_id

            all_page_layout.append(page_tokens)

    if not load_images:
        return all_page_layout

    # Imported lazily: pdf2image is only needed when page images are requested.
    import pdf2image

    pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)

    for page_id, page_image in enumerate(pdf_images):
        image_width, image_height = page_image.size
        page_layout = all_page_layout[page_id]
        layout_width = page_layout.page_data["width"]
        layout_height = page_layout.page_data["height"]

        # Rescale the token coordinates whenever the rendered image size
        # differs from the page size recorded during extraction, so that
        # the layout lines up with the image.
        if image_width != layout_width or image_height != layout_height:
            scale_x = image_width / layout_width
            scale_y = image_height / layout_height
            page_layout = page_layout.scale((scale_x, scale_y))
            page_layout.page_data["width"] = image_width
            page_layout.page_data["height"] = image_height
            all_page_layout[page_id] = page_layout

    return all_page_layout, pdf_images
0 commit comments