Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 63 additions & 12 deletions deepdoctection/dataflow/custom_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@
"""

import itertools
import io
import json
import math
import os
from collections import defaultdict
from pypdf import PdfWriter, PdfReader
from typing import DefaultDict, Dict, List, Optional, Sequence, Union

from jsonlines import Reader, Writer
Expand All @@ -40,13 +43,21 @@
# Names exported by a star-import of this module.
__all__ = ["SerializerJsonlines", "SerializerFiles", "SerializerCoco", "SerializerPdfDoc", "SerializerTabsepFiles"]


def _reset_df_and_get_length(df: DataFlow) -> int:
    """
    Reset the state of a dataflow and report how many datapoints it holds.

    :param df: Dataflow whose `reset_state` is called before its length is queried
    :return: `len(df)`, or 0 when the dataflow raises `NotImplementedError` for `len`
    """
    df.reset_state()
    try:
        return len(df)
    except NotImplementedError:
        # The dataflow cannot report a size; callers get 0 instead of an error.
        return 0
def _chunk_splits(pages_in_pdf: int, chunk_size: int, last_pdf: Optional[bool] = True) -> Dict[int, List[int]]:
    """
    Compute the page ranges needed to cut a document into consecutive chunks.

    Pages are numbered starting at 1. Every value of the returned dict is a pair `[start, stop]` where `stop` is
    exclusive, i.e. the pair can be unpacked into `range(start, stop)`.

    :param pages_in_pdf: Total number of pages of the document
    :param chunk_size: Number of pages per chunk. Must be at least 1
    :param last_pdf: If truthy, the pages left over when `pages_in_pdf` is not a multiple of `chunk_size` are added
                     as a final, shorter chunk. Otherwise they are dropped
    :return: Dict mapping the zero-based chunk index to its `[start, stop]` page range
    :raises ValueError: If `chunk_size` is smaller than 1
    """
    if chunk_size < 1:
        # Zero would otherwise surface as an opaque ZeroDivisionError and a negative size would silently
        # produce a meaningless range.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Integer arithmetic only: no detour through float division.
    full_chunks, remainder = divmod(pages_in_pdf, chunk_size)

    pages_dict: Dict[int, List[int]] = {
        chunk_idx: [chunk_idx * chunk_size + 1, (chunk_idx + 1) * chunk_size + 1] for chunk_idx in range(full_chunks)
    }

    if last_pdf and remainder:
        # The trailing chunk starts right after the last full chunk and ends with the last page.
        next_idx = len(pages_dict)
        pages_dict[next_idx] = [next_idx * chunk_size + 1, pages_in_pdf + 1]
    return pages_dict


class SerializerJsonlines:
Expand Down Expand Up @@ -544,15 +555,55 @@ def save(path: Pathlike) -> None:
raise NotImplementedError

@staticmethod
def split(
    path: Pathlike,
    path_target: Optional[Pathlike] = None,
    max_datapoint: Optional[int] = None,
    chunk_size: Optional[int] = None,
) -> None:
    """
    Split a document into single pages or into chunks of consecutive pages.

    Without `chunk_size` every page is written to `path_target` under the file name that
    `SerializerPdfDoc.load` assigned to it.

    With `chunk_size` the pages are grouped into documents of `chunk_size` pages each; the last document holds
    the remaining pages and may be shorter. A chunk is stored under the file name of the page that follows it.
    The final chunk has no following page and is stored as `<stem>_<number of pages written><extension>`.

    NOTE(review): the name of the final chunk continues the pattern only if `load` names pages
    `<stem>_<zero-based page index><extension>` -- confirm against `SerializerPdfDoc.load`.

    :param path: Path to the pdf document
    :param path_target: Directory the output is written to. Defaults to the directory of `path`
    :param max_datapoint: Maximum number of pages to process, passed on to `SerializerPdfDoc.load`
    :param chunk_size: Number of pages per output document. `None` writes one file per page
    :raises NotADirectoryError: If `path_target` is not an existing directory
    :raises ValueError: If `chunk_size` is given and smaller than 1
    """
    if path_target is None:
        path_target, _ = os.path.split(path)
    if not os.path.isdir(path_target):
        # Report the directory that is actually missing, not the source document.
        raise NotADirectoryError(path_target)
    if chunk_size is not None and chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    df = SerializerPdfDoc.load(path, max_datapoint)

    if chunk_size is None:
        for dp in df:
            with open(os.path.join(path_target, dp["file_name"]), "wb") as page_file:
                page_file.write(dp["pdf_bytes"])
        return

    # Pages are streamed into one writer at a time, so the length of the dataflow is never needed.
    writer = PdfWriter()
    pages_in_chunk = 0
    pages_total = 0

    for dp in df:
        if pages_in_chunk == chunk_size:
            # The current page opens a new chunk: flush the full one under this page's file name.
            with open(os.path.join(path_target, dp["file_name"]), "wb") as pdf_out:
                writer.write(pdf_out)
            writer = PdfWriter()
            pages_in_chunk = 0
        # Every datapoint carries exactly one page as a standalone pdf.
        single_page = PdfReader(io.BytesIO(dp["pdf_bytes"]))
        writer.add_page(single_page.pages[0])
        pages_in_chunk += 1
        pages_total += 1

    if pages_in_chunk:
        # The last chunk gets its own file name so that it cannot overwrite the chunk written before it.
        stem, extension = os.path.splitext(os.path.basename(path))
        with open(os.path.join(path_target, f"{stem}_{pages_total}{extension}"), "wb") as pdf_out:
            writer.write(pdf_out)
15 changes: 15 additions & 0 deletions tests/dataflow/test_custom_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,13 @@ class TestSerializerPdfDoc:
def test_loading() -> None:
"""
Test the loading of a .pdf file
PDF Samples: https://freetestdata.com/document-files/pdf/
"""

# Arrange
test_file = os.path.join(get_test_path(), "test_file.pdf")
long_test_file = os.path.join(get_test_path(), "260KB.pdf")
long_test_file_0 = os.path.join(get_test_path(), "260KB_10.pdf")

# Act
df = SerializerPdfDoc.load(test_file)
Expand All @@ -175,3 +178,15 @@ def test_loading() -> None:
assert first_image["path"] == test_file
assert first_image["file_name"] == "test_file_0.pdf"
assert isinstance(first_image["pdf_bytes"], bytes)

# Act
SerializerPdfDoc.split(long_test_file, chunk_size=10)
df = SerializerPdfDoc.load(long_test_file_0)
output = collect_datapoint_from_dataflow(df=df)
first_image = output[0]

# Assert
assert len(output) == 10
assert first_image["path"] == long_test_file_0
assert first_image["file_name"] == "260KB_10_0.pdf"
assert isinstance(first_image["pdf_bytes"], bytes)
Binary file added tests/test_objects/260KB.pdf
Binary file not shown.