Skip to content

Commit 9404fb5

Browse files
authored
feat: FileDatasetBuilder (#70)
* Add README for Docling-DPBench Signed-off-by: Christoph Auer <[email protected]> * Add FileDatasetBuilder Signed-off-by: Christoph Auer <[email protected]> * Add test and fixes Signed-off-by: Christoph Auer <[email protected]> --------- Signed-off-by: Christoph Auer <[email protected]>
1 parent e3debd6 commit 9404fb5

File tree

5 files changed

+171
-0
lines changed

5 files changed

+171
-0
lines changed

docling_eval/dataset_builders/dataset_builder.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,10 @@ def save_to_disk(
263263
test_dir = self.target / self.split
264264
test_dir.mkdir(parents=True, exist_ok=True)
265265

266+
if do_visualization:
267+
viz_path = self.target / "visualizations"
268+
viz_path.mkdir(exist_ok=True)
269+
266270
count = 0
267271
chunk_count = 0
268272

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
import logging
2+
import mimetypes
3+
from io import BytesIO
4+
from pathlib import Path
5+
from typing import Iterable, List
6+
7+
from docling_core.types import DoclingDocument
8+
from docling_core.types.doc import ImageRef, PageItem, Size
9+
from docling_core.types.io import DocumentStream
10+
from PIL import Image
11+
from tqdm import tqdm
12+
13+
from docling_eval.datamodels.dataset_record import DatasetRecord
14+
from docling_eval.datamodels.types import BenchMarkColumns
15+
from docling_eval.dataset_builders.dataset_builder import BaseEvaluationDatasetBuilder
16+
from docling_eval.utils.utils import (
17+
add_pages_to_true_doc,
18+
extract_images,
19+
from_pil_to_base64uri,
20+
get_binary,
21+
get_binhash,
22+
)
23+
24+
# Get logger
25+
_log = logging.getLogger(__name__)
26+
27+
28+
class FileDatasetBuilder(BaseEvaluationDatasetBuilder):
    """
    File dataset builder implementing the base dataset builder interface.

    This builder processes a folder of PDFs or image files and creates a plain
    ground-truth dataset without annotations.
    """

    # Kept as a tuple so the default can never be mutated through one instance
    # and silently leak into every later instance.
    DEFAULT_FILE_EXTENSIONS = (
        "pdf",
        "tif",
        "tiff",
        "jpg",
        "jpeg",
        "png",
        "bmp",
        "gif",
    )

    def __init__(
        self,
        dataset_source: Path,
        target: Path,
        split: str = "test",
        begin_index: int = 0,
        end_index: int = -1,
        file_extensions: Iterable[str] = DEFAULT_FILE_EXTENSIONS,
    ):
        """
        Initialize the File dataset builder.

        Args:
            dataset_source: Folder where data files reside
            target: Path where processed dataset will be saved
            split: Dataset split to use
            begin_index: Start index for processing (inclusive)
            end_index: End index for processing (exclusive), -1 means process all
            file_extensions: Extensions (with or without a leading dot) of the
                files to pick up from ``dataset_source``; matched
                case-insensitively
        """
        super().__init__(
            name="FileDataset",
            dataset_source=dataset_source,  # Local Path to dataset
            target=target,
            split=split,
            begin_index=begin_index,
            end_index=end_index,
        )
        # Copy so later changes to the caller's sequence do not affect this builder.
        self.file_extensions: List[str] = list(file_extensions)
        self.must_retrieve = False

    def _collect_files(self) -> List[Path]:
        """
        Return the sorted files in the source folder matching the extensions.

        Every directory entry is inspected exactly once, so a file can never be
        listed twice (which separate lower-/upper-case glob passes could do),
        and mixed-case suffixes such as ``.Pdf`` are picked up as well.
        """
        # Narrows the type for static checkers; this builder only reads local folders.
        assert isinstance(self.dataset_source, Path)

        suffixes = tuple(
            "." + ext.lower().lstrip(".") for ext in self.file_extensions
        )
        # str.endswith with an empty tuple is False, so no extensions means no files.
        return sorted(
            path
            for path in self.dataset_source.iterdir()
            if path.is_file() and path.name.lower().endswith(suffixes)
        )

    def iterate(self) -> Iterable[DatasetRecord]:
        """
        Iterate through the dataset and yield DatasetRecord objects.

        Yields:
            DatasetRecord objects

        Raises:
            ValueError: If the MIME type guessed from a file name is neither
                PDF nor an image type.
        """
        files = self._collect_files()

        # Apply index range
        begin, end = self.get_effective_indices(len(files))
        selected_files = files[begin:end]

        # Log stats
        self.log_dataset_stats(len(files), len(selected_files))
        _log.info("Processing File dataset with %d files", len(selected_files))

        for filename in tqdm(
            selected_files,
            desc="Processing files for FileDataset",
            ncols=128,
        ):
            # The type is guessed from the file name only, not from the content.
            mime_type, _ = mimetypes.guess_type(filename)

            # Create the ground truth Document
            true_doc = DoclingDocument(name=f"{filename}")
            if mime_type == "application/pdf":
                true_doc, _ = add_pages_to_true_doc(
                    pdf_path=filename, true_doc=true_doc, image_scale=2.0
                )
            elif mime_type is not None and mime_type.startswith("image/"):
                # Close the file handle as soon as the pixel data is converted.
                with Image.open(filename) as raw_image:
                    image: Image.Image = raw_image.convert("RGB")

                image_ref = ImageRef(
                    mimetype="image/png",
                    dpi=72,
                    size=Size(width=image.width, height=image.height),
                    uri=from_pil_to_base64uri(image),
                )
                # An image file is represented as a single-page document.
                true_doc.pages[1] = PageItem(
                    page_no=1,
                    size=Size(width=float(image.width), height=float(image.height)),
                    image=image_ref,
                )
            else:
                raise ValueError(
                    f"{filename} was not recognized as a supported type, aborting."
                )

            # Extract images from the ground truth document
            true_doc, true_pictures, true_page_images = extract_images(
                document=true_doc,
                pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
                page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
            )

            # Read the original file (PDF or image) as binary data
            file_bytes = get_binary(filename)
            file_stream = DocumentStream(
                name=filename.name, stream=BytesIO(file_bytes)
            )

            # Create dataset record
            yield DatasetRecord(
                doc_id=str(filename.name),
                doc_hash=get_binhash(file_bytes),
                ground_truth_doc=true_doc,
                ground_truth_pictures=true_pictures,
                ground_truth_page_images=true_page_images,
                original=file_stream,
                mime_type=mime_type,
            )
301 KB
Loading
208 KB
Binary file not shown.

tests/test_dataset_builder.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from docling_eval.dataset_builders.doclaynet_v2_builder import DocLayNetV2DatasetBuilder
1919
from docling_eval.dataset_builders.docvqa_builder import DocVQADatasetBuilder
2020
from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder
21+
from docling_eval.dataset_builders.file_dataset_builder import FileDatasetBuilder
2122
from docling_eval.dataset_builders.funsd_builder import FUNSDDatasetBuilder
2223
from docling_eval.dataset_builders.omnidocbench_builder import (
2324
OmniDocBenchDatasetBuilder,
@@ -579,3 +580,13 @@ def test_run_pixparse_builder():
579580
target_dataset_dir=target_path / "eval_dataset_e2e",
580581
end_index=5,
581582
)
583+
584+
585+
def test_file_dataset_builder():
    """Build a dataset from the sample files folder and check the output folders."""
    target_path = Path("./scratch/file_dataset/")

    dataset_builder = FileDatasetBuilder(
        dataset_source=Path("./tests/data/files"), target=target_path
    )

    dataset_builder.save_to_disk(do_visualization=True)

    # save_to_disk creates the split folder under the target (with parents) and,
    # when do_visualization is set, a "visualizations" folder next to it.
    assert target_path.is_dir()
    assert (target_path / "visualizations").is_dir()

0 commit comments

Comments
 (0)