Skip to content

Commit a157d5a

Browse files
fix: fix cropbox if it is larger than mediabox (#126)
Signed-off-by: Peter Staar <[email protected]>
1 parent d79c769 commit a157d5a

File tree

2 files changed

+28
-0
lines changed

2 files changed

+28
-0
lines changed

docling_parse/visualize.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,11 @@ def visualise_py(
144144

145145
pdf_page: SegmentedPdfPage = pdf_doc.get_page(page_no=page_no)
146146

147+
if os.path.exists(str(output_dir)):
148+
pdf_page.save_as_json(
149+
Path(f"{output_dir}/{os.path.basename(pdf_path)}.page_{page_no}.json")
150+
)
151+
147152
if category in ["all", "char"]:
148153

149154
img = pdf_page.render_as_image(

src/v2/pdf_resources/page_dimension.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,29 @@ namespace pdflib
246246
crop_bbox = media_bbox;
247247
}
248248

249+
if(crop_bbox[0]<media_bbox[0] or
250+
crop_bbox[2]>media_bbox[2] or
251+
crop_bbox[1]<media_bbox[1] or
252+
crop_bbox[3]>media_bbox[3])
253+
{
254+
LOG_S(ERROR) << "The crop-box is larger than the media-box, \n"
255+
<< "crop-box: {"
256+
<< crop_bbox[0] << ", "
257+
<< crop_bbox[1] << ", "
258+
<< crop_bbox[2] << ", "
259+
<< crop_bbox[3] << "}\n"
260+
<< "media-box: {"
261+
<< media_bbox[0] << ", "
262+
<< media_bbox[1] << ", "
263+
<< media_bbox[2] << ", "
264+
<< media_bbox[3] << "}\n";
265+
266+
crop_bbox[0] = std::max(crop_bbox[0], media_bbox[0]);
267+
crop_bbox[1] = std::max(crop_bbox[1], media_bbox[1]);
268+
crop_bbox[2] = std::min(crop_bbox[2], media_bbox[2]);
269+
crop_bbox[3] = std::min(crop_bbox[3], media_bbox[3]);
270+
}
271+
249272
if(json_resources.count("/BleedBox"))
250273
{
251274
for(int d=0; d<4; d++)

0 commit comments

Comments
 (0)