
Commit c08950b

perf: Improve parquet writing with plain pyarrow (#134)
* perf: Improve parquet writing with plain pyarrow
* Smaller fixes
* Add pyarrow dep
* Fix circular import

Signed-off-by: Christoph Auer <[email protected]>
1 parent a34f264 commit c08950b

File tree

10 files changed, +189 -510 lines changed

docling_eval/cli/main.py

Lines changed: 7 additions & 0 deletions
@@ -910,13 +910,20 @@ def create_cvat(
     gt_dir: Annotated[Path, typer.Option(help="Dataset source path")],
     bucket_size: Annotated[int, typer.Option(help="Size of CVAT tasks")] = 20,
     use_predictions: Annotated[bool, typer.Option(help="use predictions")] = False,
+    sliding_window: Annotated[
+        int,
+        typer.Option(
+            help="Size of sliding window for page processing (1 for single pages, >1 for multi-page windows)"
+        ),
+    ] = 2,
 ):
     """Create dataset ready to upload to CVAT starting from (ground-truth) dataset."""
     builder = CvatPreannotationBuilder(
         dataset_source=gt_dir,
         target=output_dir,
         bucket_size=bucket_size,
         use_predictions=use_predictions,
+        sliding_window=sliding_window,
     )
     builder.prepare_for_annotation()

docling_eval/datamodels/dataset_record.py

Lines changed: 101 additions & 47 deletions
@@ -1,4 +1,5 @@
 import json
+from enum import Enum
 from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Optional, Union
@@ -19,6 +20,61 @@
 seg_adapter = TypeAdapter(Dict[int, SegmentedPage])


+class FieldType(Enum):
+    STRING = "string"
+    BINARY = "binary"
+    IMAGE_LIST = "image_list"
+    STRING_LIST = "string_list"
+
+
+class SchemaGenerator:
+    """Generates both HuggingFace Features and PyArrow schemas from a field definition."""
+
+    @staticmethod
+    def _get_features_type(field_type: FieldType):
+        mapping = {
+            FieldType.STRING: Value("string"),
+            FieldType.BINARY: Value("binary"),
+            FieldType.IMAGE_LIST: Sequence(Features_Image()),
+            FieldType.STRING_LIST: Sequence(Value("string")),
+        }
+        return mapping[field_type]
+
+    @staticmethod
+    def _get_pyarrow_type(field_type: FieldType):
+        import pyarrow as pa
+
+        image_type = pa.struct([("bytes", pa.binary()), ("path", pa.string())])
+
+        mapping = {
+            FieldType.STRING: pa.string(),
+            FieldType.BINARY: pa.binary(),
+            FieldType.IMAGE_LIST: pa.list_(image_type),
+            FieldType.STRING_LIST: pa.list_(pa.string()),
+        }
+        return mapping[field_type]
+
+    @classmethod
+    def generate_features(cls, field_definitions: Dict[str, FieldType]) -> Features:
+        return Features(
+            {
+                field_name: cls._get_features_type(field_type)
+                for field_name, field_type in field_definitions.items()
+            }
+        )
+
+    @classmethod
+    def generate_pyarrow_schema(cls, field_definitions: Dict[str, FieldType]):
+        import pyarrow as pa
+
+        return pa.schema(
+            [
+                (field_name, cls._get_pyarrow_type(field_type))
+                for field_name, field_type in field_definitions.items()
+            ]
+        )
+
+
 class DatasetRecord(
     BaseModel
 ):  # TODO make predictionrecord class, factor prediction-related fields there.
@@ -51,26 +107,30 @@ class DatasetRecord(
     def get_field_alias(cls, field_name: str) -> str:
         return cls.model_fields[field_name].alias or field_name

+    @classmethod
+    def _get_field_definitions(cls) -> Dict[str, FieldType]:
+        """Define the schema for this class. Override in subclasses to extend."""
+        return {
+            cls.get_field_alias("doc_id"): FieldType.STRING,
+            cls.get_field_alias("doc_path"): FieldType.STRING,
+            cls.get_field_alias("doc_hash"): FieldType.STRING,
+            cls.get_field_alias("ground_truth_doc"): FieldType.STRING,
+            cls.get_field_alias("ground_truth_segmented_pages"): FieldType.STRING,
+            cls.get_field_alias("ground_truth_pictures"): FieldType.IMAGE_LIST,
+            cls.get_field_alias("ground_truth_page_images"): FieldType.IMAGE_LIST,
+            cls.get_field_alias("original"): FieldType.BINARY,
+            cls.get_field_alias("mime_type"): FieldType.STRING,
+            cls.get_field_alias("modalities"): FieldType.STRING_LIST,
+        }
+
     @classmethod
     def features(cls):
-        return Features(
-            {
-                cls.get_field_alias("doc_id"): Value("string"),
-                cls.get_field_alias("doc_path"): Value("string"),
-                cls.get_field_alias("doc_hash"): Value("string"),
-                cls.get_field_alias("ground_truth_doc"): Value("string"),
-                cls.get_field_alias("ground_truth_segmented_pages"): Value("string"),
-                cls.get_field_alias("ground_truth_pictures"): Sequence(
-                    Features_Image()
-                ),
-                cls.get_field_alias("ground_truth_page_images"): Sequence(
-                    Features_Image()
-                ),
-                cls.get_field_alias("original"): Value("binary"),
-                cls.get_field_alias("mime_type"): Value("string"),
-                cls.get_field_alias("modalities"): Sequence(Value("string")),
-            }
-        )
+        return SchemaGenerator.generate_features(cls._get_field_definitions())
+
+    @classmethod
+    def pyarrow_schema(cls):
+        """Generate PyArrow schema that matches the HuggingFace datasets image format."""
+        return SchemaGenerator.generate_pyarrow_schema(cls._get_field_definitions())

     def _extract_images(
         self,
@@ -207,37 +267,31 @@ class DatasetRecordWithPrediction(DatasetRecord):

     model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True)

+    @classmethod
+    def _get_field_definitions(cls) -> Dict[str, FieldType]:
+        """Extend the parent schema with prediction-specific fields."""
+        base_definitions = super()._get_field_definitions()
+        prediction_definitions = {
+            cls.get_field_alias("predictor_info"): FieldType.STRING,
+            cls.get_field_alias("status"): FieldType.STRING,
+            cls.get_field_alias("predicted_doc"): FieldType.STRING,
+            cls.get_field_alias("predicted_segmented_pages"): FieldType.STRING,
+            cls.get_field_alias("predicted_pictures"): FieldType.IMAGE_LIST,
+            cls.get_field_alias("predicted_page_images"): FieldType.IMAGE_LIST,
+            cls.get_field_alias("prediction_format"): FieldType.STRING,
+            cls.get_field_alias("prediction_timings"): FieldType.STRING,
+            cls.get_field_alias("original_prediction"): FieldType.STRING,
+        }
+        return {**base_definitions, **prediction_definitions}
+
     @classmethod
     def features(cls):
-        return Features(
-            {
-                cls.get_field_alias("doc_id"): Value("string"),
-                cls.get_field_alias("doc_path"): Value("string"),
-                cls.get_field_alias("doc_hash"): Value("string"),
-                cls.get_field_alias("ground_truth_doc"): Value("string"),
-                cls.get_field_alias("ground_truth_segmented_pages"): Value("string"),
-                cls.get_field_alias("ground_truth_pictures"): Sequence(
-                    Features_Image()
-                ),
-                cls.get_field_alias("ground_truth_page_images"): Sequence(
-                    Features_Image()
-                ),
-                cls.get_field_alias("original"): Value("binary"),
-                cls.get_field_alias("mime_type"): Value("string"),
-                cls.get_field_alias("modalities"): Sequence(Value("string")),
-                cls.get_field_alias("predictor_info"): Value("string"),
-                cls.get_field_alias("status"): Value("string"),
-                cls.get_field_alias("predicted_doc"): Value("string"),
-                cls.get_field_alias("predicted_segmented_pages"): Value("string"),
-                cls.get_field_alias("predicted_pictures"): Sequence(Features_Image()),
-                cls.get_field_alias("predicted_page_images"): Sequence(
-                    Features_Image()
-                ),
-                cls.get_field_alias("prediction_format"): Value("string"),
-                cls.get_field_alias("prediction_timings"): Value("string"),
-                cls.get_field_alias("original_prediction"): Value("string"),
-            }
-        )
+        return SchemaGenerator.generate_features(cls._get_field_definitions())
+
+    @classmethod
+    def pyarrow_schema(cls):
+        """Generate PyArrow schema that matches the HuggingFace datasets image format."""
+        return SchemaGenerator.generate_pyarrow_schema(cls._get_field_definitions())

     def as_record_dict(self):
         record = super().as_record_dict()

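Both record classes now expose the two schema views generated from the same field definitions. A minimal inspection sketch (assumes docling_eval and pyarrow are installed; the printed field names depend on the model aliases):

# Minimal sketch: the Features and pyarrow views come from the same
# field definitions, so their field names and nesting match one another.
from docling_eval.datamodels.dataset_record import (
    DatasetRecord,
    DatasetRecordWithPrediction,
)

# HuggingFace `datasets` Features, used when loading shards back as a Dataset.
print(DatasetRecord.features())

# Plain pyarrow schema with the same layout, used for direct parquet writes.
print(DatasetRecord.pyarrow_schema())

# The prediction record extends the same definitions with prediction fields.
print(DatasetRecordWithPrediction.pyarrow_schema())
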
docling_eval/dataset_builders/cvat_preannotation_builder.py

Lines changed: 5 additions & 1 deletion
@@ -47,6 +47,7 @@ def __init__(
         target: Path,
         bucket_size: int = 200,
         use_predictions: bool = False,
+        sliding_window: int = 2,
     ):
         """
         Initialize the CvatPreannotationBuilder.
@@ -55,10 +56,13 @@
             dataset_source: Directory containing the source dataset
             target: Directory where CVAT preannotations will be saved
             bucket_size: Number of documents per bucket for CVAT tasks
+            use_predictions: Whether to use predictions instead of ground truth
+            sliding_window: Size of sliding window for page processing (1 for single pages, >1 for multi-page windows)
         """
         self.source_dir = dataset_source
         self.target_dir = target
         self.bucket_size = bucket_size
+        self.sliding_window = sliding_window
         self.benchmark_dirs = BenchMarkDirs()
         self.benchmark_dirs.set_up_directory_structure(
             source=dataset_source, target=target
@@ -799,7 +803,7 @@ def prepare_for_annotation(self) -> None:
         _log.info(f"Preparing dataset from {self.source_dir} for CVAT annotation")
         self._create_project_properties()
         self.overview = self._export_from_dataset()
-        self._create_preannotation_files(sliding_window=1)
+        self._create_preannotation_files(sliding_window=self.sliding_window)
         _log.info(f"CVAT annotation preparation complete in {self.target_dir}")

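A hedged usage sketch of the new knob (the paths are placeholders; the other arguments mirror the constructor shown above):

from pathlib import Path

from docling_eval.dataset_builders.cvat_preannotation_builder import (
    CvatPreannotationBuilder,
)

builder = CvatPreannotationBuilder(
    dataset_source=Path("./gt_dataset"),  # placeholder path
    target=Path("./cvat_tasks"),          # placeholder path
    bucket_size=20,
    use_predictions=False,
    sliding_window=2,  # 1 = single pages, >1 = multi-page windows
)
builder.prepare_for_annotation()
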
docling_eval/dataset_builders/dataset_builder.py

Lines changed: 1 addition & 1 deletion
@@ -305,8 +305,8 @@ def save_to_disk(
         save_shard_to_disk(
             items=record_list,
             dataset_path=test_dir,
+            schema=DatasetRecord.pyarrow_schema(),
             shard_id=chunk_count,
-            features=DatasetRecord.features(),
         )
         count += len(record_list)
         chunk_count += 1

docling_eval/prediction_providers/base_prediction_provider.py

Lines changed: 1 addition & 1 deletion
@@ -398,8 +398,8 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]:
         save_shard_to_disk(
             items=record_chunk,
             dataset_path=test_dir,
+            schema=DatasetRecordWithPrediction.pyarrow_schema(),
             shard_id=chunk_count,
-            features=DatasetRecordWithPrediction.features(),
         )
         count += len(record_chunk)
         chunk_count += 1

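Both call sites above follow the same pattern: the precomputed Arrow schema replaces the former features= argument. A self-contained sketch of the call, with placeholder values standing in for the variables used in the surrounding code:

from pathlib import Path

from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction
from docling_eval.utils.utils import save_shard_to_disk

record_chunk: list = []    # record dicts, e.g. from DatasetRecordWithPrediction.as_record_dict()
test_dir = Path("./test")  # placeholder shard output directory

save_shard_to_disk(
    items=record_chunk,
    dataset_path=test_dir,
    schema=DatasetRecordWithPrediction.pyarrow_schema(),  # explicit Arrow schema, now mandatory
    shard_id=0,
)
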
docling_eval/utils/utils.py

Lines changed: 70 additions & 19 deletions
@@ -7,14 +7,15 @@
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union

 import pandas as pd
 import PIL.Image
 from bs4 import BeautifulSoup  # type: ignore
 from datasets import Dataset, Features, load_dataset
 from datasets.iterable_dataset import IterableDataset
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import InputDocument
 from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
@@ -78,12 +79,14 @@ def write_datasets_info(
         json.dump(dataset_infos, fw, indent=2)


-def get_input_document(file: Path | BytesIO) -> InputDocument:
+def get_input_document(
+    file: Path | BytesIO, backend_t: Type[Any] = DoclingParseV4DocumentBackend
+) -> InputDocument:
     return InputDocument(
         path_or_stream=file,
         format=InputFormat.PDF,  # type: ignore[arg-type]
         filename=file.name if isinstance(file, Path) else "foo",
-        backend=DoclingParseV4DocumentBackend,
+        backend=backend_t,
     )

@@ -97,7 +100,7 @@ def from_pil_to_base64uri(img: Image.Image) -> AnyUrl:
 def add_pages_to_true_doc(
     pdf_path: Path | BytesIO, true_doc: DoclingDocument, image_scale: float = 1.0
 ):
-    in_doc = get_input_document(pdf_path)
+    in_doc = get_input_document(pdf_path, backend_t=PyPdfiumDocumentBackend)
     assert in_doc.valid, "Input doc must be valid."
     # assert in_doc.page_count == 1, "doc must have one page."

@@ -106,7 +109,11 @@

     for page_no in range(0, in_doc.page_count):
         page = Page(page_no=page_no)
-        page._backend = in_doc._backend.load_page(page.page_no)  # type: ignore[attr-defined]
+        try:
+            page._backend = in_doc._backend.load_page(page.page_no)  # type: ignore[attr-defined]
+        except RuntimeError as e:
+            logging.warning(f"Failed to load page {page.page_no}: {e}")
+            page._backend = None

         if page._backend is not None and page._backend.is_valid():
             page.size = page._backend.get_size()
@@ -489,34 +496,78 @@ def insert_images(
     return document


+def _pil_to_bytes(img: PIL.Image.Image) -> bytes:
+    """Convert PIL image to PNG bytes efficiently."""
+    buffered = io.BytesIO()
+    img.save(buffered, format="PNG")
+    return buffered.getvalue()
+
+
 def save_shard_to_disk(
     items: List[Any],
     dataset_path: Path,
+    schema: Any,
     thread_id: int = 0,
     shard_id: int = 0,
-    features: Optional[Features] = None,
-    shard_format: str = "parquet",
 ) -> None:
-    """Save shard to disk."""
+    """Save shard to disk as parquet."""
     if not items:
         return

-    # Use features if provided to avoid schema inference
-    batch = Dataset.from_list(items, features=features)
-
-    output_file = dataset_path / f"shard_{thread_id:06}_{shard_id:06}.{shard_format}"
-    if shard_format == "json":
-        batch.to_json(output_file)
-    elif shard_format == "parquet":
-        batch.to_parquet(output_file)
-    else:
-        raise ValueError(f"Unsupported shard_format: {shard_format}")
+    # Write directly to parquet using pyarrow to avoid Dataset.from_list() overhead
+    _save_to_parquet_direct(items, dataset_path, thread_id, shard_id, schema)

-    logging.info(f"Saved shard {shard_id} to {output_file} with {len(items)} documents")
+    logging.info(
+        f"Saved shard {shard_id} to {dataset_path / f'shard_{thread_id:06}_{shard_id:06}.parquet'} with {len(items)} documents"
+    )

     shard_id += 1


+def _save_to_parquet_direct(
+    items: List[Any], dataset_path: Path, thread_id: int, shard_id: int, schema: Any
+) -> None:
+    """Save directly to parquet using pyarrow to avoid Dataset.from_list() overhead."""
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    # Import here to avoid circular import
+    from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction
+
+    # Convert data to pyarrow table format
+    records = []
+    for item in items:
+        record = dict(item)
+
+        # Convert PIL images to bytes for direct Arrow storage
+        for field_name in [
+            DatasetRecordWithPrediction.get_field_alias("ground_truth_pictures"),
+            DatasetRecordWithPrediction.get_field_alias("ground_truth_page_images"),
+            DatasetRecordWithPrediction.get_field_alias("predicted_pictures"),
+            DatasetRecordWithPrediction.get_field_alias("predicted_page_images"),
+        ]:
+            if field_name in record:
+                images = record[field_name]
+                if (
+                    images
+                    and len(images) > 0
+                    and isinstance(images[0], PIL.Image.Image)
+                ):
+                    # Convert to the same format as HuggingFace datasets expects
+                    record[field_name] = [
+                        {"bytes": _pil_to_bytes(img), "path": None} for img in images
+                    ]
+
+        records.append(record)
+
+    # Create pyarrow table with mandatory explicit schema
+    table = pa.Table.from_pylist(records, schema=schema)
+
+    # Write to parquet
+    output_file = dataset_path / f"shard_{thread_id:06}_{shard_id:06}.parquet"
+    pq.write_table(table, output_file)
+
+
 def dataset_exists(
     ds_path: Path,
     split: str,
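
The speedup comes from skipping Dataset.from_list(), which pushes every record through the datasets feature-encoding machinery before writing; building a pa.Table straight from the record dicts and calling pq.write_table avoids that pass, while storing each image as a {"bytes", "path"} struct keeps the file compatible with the HuggingFace Image feature. A self-contained toy sketch of the pattern (toy field names, not the real DatasetRecord schema):

# Toy sketch of the direct-write pattern used in _save_to_parquet_direct.
import io

import PIL.Image
import pyarrow as pa
import pyarrow.parquet as pq

# Same struct layout the datasets Image feature expects on disk.
image_type = pa.struct([("bytes", pa.binary()), ("path", pa.string())])
schema = pa.schema([("doc_id", pa.string()), ("page_images", pa.list_(image_type))])


def pil_to_bytes(img: PIL.Image.Image) -> bytes:
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()


img = PIL.Image.new("RGB", (32, 32), color="white")
records = [
    {
        "doc_id": "doc-0001",
        "page_images": [{"bytes": pil_to_bytes(img), "path": None}],
    }
]

# Explicit schema: no inference pass, no datasets encoding overhead.
table = pa.Table.from_pylist(records, schema=schema)
pq.write_table(table, "shard_000000_000000.parquet")

Because the image columns keep the {bytes, path} struct layout, shards written this way should still load back through the datasets library with the matching features().
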
