From b8d351577fb715b31d5b5408438aa2a251370249 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 15 Aug 2025 13:19:52 +0200 Subject: [PATCH 1/5] feat: Extend the DoclingEvalCOCOExporter to have the export_COCO() method that exports DoclingDocument column from parquet to a COCO dataset. It can operate either on the true_doc or on pred_doc. Extend the main CLI to support multiple operations. Signed-off-by: Nikos Livathinos --- docling_eval/utils/coco_exporter.py | 341 ++++++++++++++++++++++++---- 1 file changed, 301 insertions(+), 40 deletions(-) diff --git a/docling_eval/utils/coco_exporter.py b/docling_eval/utils/coco_exporter.py index 007c9df1..a9bff893 100644 --- a/docling_eval/utils/coco_exporter.py +++ b/docling_eval/utils/coco_exporter.py @@ -19,7 +19,7 @@ # If the original COCO dataset does not call all categories (e.g. DLNv1), the mapping is ignored -DOCLING_LABELS_TO_COCO_CATEGORIES: Dict[DocItemLabel, str] = { +VALID_DOCLING_LABELS_TO_COCO_CATEGORIES: Dict[DocItemLabel, str] = { DocItemLabel.CAPTION: "Caption", DocItemLabel.FOOTNOTE: "Footnote", DocItemLabel.FORMULA: "Formula", @@ -105,6 +105,7 @@ def __init__(self, docling_eval_ds_path: Path): def export_COCO_and_predictions( self, + split: str, save_dir: Path, ): r""" @@ -115,12 +116,244 @@ def export_COCO_and_predictions( # TODO pass + def export_COCO( + self, + split: str, + save_dir: Path, + doc_label_to_valid_label_mapping: dict[DocItemLabel, DocItemLabel], + source_doc_column: str = "GT", + ): + r""" + Parameters + ---------- + save_dir: Location to save the exported COCO dataset + split: COCO split to be created: One of ['train', 'test', 'val'] + source_doc_column: Which column from the parquet file should be used to generate the COCO dataset. + It should be one of ["GT", "pred"]. 
By default "GT" + """ + # Build the info and licenses + info: dict = self._build_info() + licenses: list[dict] = self._build_licenses() + + # Generate mapping from document labels to category_id + label_to_category_id: dict[DocItemLabel, int] = { + label: cat_id + for cat_id, (label, category) in enumerate( + VALID_DOCLING_LABELS_TO_COCO_CATEGORIES.items() + ) + } + # Add the additional document labels which are not "valid" + for doc_label, valid_label in doc_label_to_valid_label_mapping.items(): + label_to_category_id[doc_label] = label_to_category_id[valid_label] + + # Build the categories + categories: list[dict] = self._build_categories() + + # Get the images dir + images_dir = save_dir / split + images_dir.mkdir(parents=True, exist_ok=True) + + # Build images and annotations + images: list[dict] = [] + anns: list[dict] = [] + ds = self._load_ds(split) + ds_selection = ds[split] + image_id = -1 + annotation_id = -1 + for i, data in enumerate(ds_selection): + data_record = DatasetRecordWithPrediction.model_validate(data) + doc_id = data_record.doc_id + if data_record.predicted_doc is not None and source_doc_column == "pred": + doc = data_record.predicted_doc + _log.info("Dataset document to export: 'predicted_doc'") + else: + doc = data_record.ground_truth_doc + _log.info("Dataset document to export: 'ground_truth_doc'") + + # Convert the doc in a COCO-dataset + doc_images: list[dict] + doc_anns: list[dict] + doc_images, doc_anns, image_id, annotation_id = ( + self._extract_layout_coco_annotations( + doc_id, + doc, + label_to_category_id, + images_dir, + image_id, + annotation_id, + ) + ) + images.extend(doc_images) + anns.extend(doc_anns) + + # Save the annotations + annotations: dict = { + "info": info, + "categories": categories, + "images": images, + "annotations": anns, + "licenses": licenses, + } + annotations_dir = save_dir / "annotations" + annotations_dir.mkdir(parents=True, exist_ok=True) + annotations_fn = annotations_dir / f"{split}2017.json" + 
_log.info("Saving the exported COCO annotations in: %s", str(annotations_fn)) + with open(annotations_fn, "w") as fd: + json.dump(annotations, fd) + + return annotations + + def _extract_layout_coco_annotations( + self, + doc_id: str, + doc: DoclingDocument, + labels_to_category_ids: Dict[DocItemLabel, int], + images_dir: Path, + image_id_offset: int, + annotation_id_offset: int, + ) -> Tuple[List[Dict], List[Dict], int, int]: + r""" + Returns + ------- + images: List of dict in COCO format with the images in the document + annotations: List of dict in COCO format with the annotations in the document + last_image_id: The last image id generated by that document + last_annotation_id: The last annotation_id generated by that document + """ + doc_images: list[dict] = [] # Images of the document + doc_anns: list[dict] = [] # Annotations of the document + + included_content_layers = {c for c in ContentLayer} + image_id = 1 + image_id_offset + annotation_id = 1 + annotation_id_offset + for item, _ in doc.iterate_items( + included_content_layers=included_content_layers + ): + if not isinstance(item, DocItem): + continue + label = item.label + category_id = labels_to_category_ids.get(label, -1) + + # Skip label without mapping into a COCO categories + if category_id == -1: + _log.error( + "Skip prediction with label that does not map to COCO categories: '%s'", + label, + ) + continue + + # TODO: Use only the first provenance of the item + if len(item.prov) == 0: + _log.error("Skip item without provenance: %s: %s", doc_id, label.value) + continue + + increase_image_id = False + prov = item.prov[0] + page_no = prov.page_no + page = doc.pages[page_no] + page_size = page.size + + # Save the page image + if page.image is not None and page_no > len(doc_images): + img: Image.Image = page.image.pil_image # type: ignore + if img: + assert ( + img.width == page_size.width and img.height == page_size.height + ) + + image_filename = ( + f"{doc_id}.png" + if "page" in doc_id + else 
f"{doc_id}_page_{page_no:06d}.png" + ) + image_fn = images_dir / image_filename + _log.info("Saving image: %s", str(image_fn)) + img.save(image_fn) + + doc_images.append( + { + "licence": 1, + "file_name": image_filename, + "height": img.height, + "width": img.width, + "id": image_id, + } + ) + increase_image_id = True + + # Get the bbox in [x,y,w,h] COCO format + bbox: BoundingBox = prov.bbox.to_top_left_origin( + page_height=page_size.height + ) + doc_anns.append( + { + "image_id": image_id, + "category_id": category_id, + "bbox": [bbox.l, bbox.t, bbox.width, bbox.height], + "iscrowed": 0, + "area": bbox.area(), + "id": annotation_id, + } + ) + + # Update the ids + annotation_id += 1 + if increase_image_id: + image_id += 1 + + return doc_images, doc_anns, image_id, annotation_id + + def _build_licenses(self) -> list[dict]: + r""" """ + license = { + "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/", + "id": 1, + "name": "Attribution-NonCommercial-ShareAlike License", + } + return [license] + + def _load_ds(self, split: str) -> Dataset: + r"""Load the dataset from the parquet files""" + split_path = str(self._docling_eval_ds_path / split / "*.parquet") + split_files = glob.glob(split_path) + ds = load_dataset("parquet", data_files={split: split_files}) + return ds + + def _build_info(self): + r""" """ + info = { + "description": "COCO 2017 Dataset", + "url": "http://cocodataset.org", + "version": "1.0", + "year": 2017, + "contributor": "COCO Consortium", + "date_created": "2017/09/01", + } + return info + + def _build_categories( + self, + supercategory: str = "DoclingDocument", + ) -> list[dict]: + r""" """ + categories: list[dict] = [] + for cat_id, (label, category_name) in enumerate( + VALID_DOCLING_LABELS_TO_COCO_CATEGORIES.items() + ): + categories.append( + { + "supercategory": supercategory, + "id": cat_id, + "name": category_name, + } + ) + return categories + def export_predictions_wrt_original_COCO( self, split: str, save_dir: Path, 
original_coco_dir: Path, - labels_to_categories: Dict[DocItemLabel, str], ) -> List[Dict]: r""" Export the predictions as a json file in pycocotools format: @@ -169,14 +402,12 @@ def export_predictions_wrt_original_COCO( } labels_to_category_ids: Dict[DocItemLabel, int] = { label: category_to_id[category] - for label, category in labels_to_categories.items() + for label, category in VALID_DOCLING_LABELS_TO_COCO_CATEGORIES.items() if category in category_to_id } # Load the HF dataset - split_path = str(self._docling_eval_ds_path / split / "*.parquet") - split_files = glob.glob(split_path) - ds = load_dataset("parquet", data_files={split: split_files}) + ds = self._load_ds(split) ds_selection: Dataset = ds[split] # Debug @@ -212,7 +443,7 @@ def export_predictions_wrt_original_COCO( coco_img_height = im.height # Extract labels, bboxes, scores - category_ids, scores, bboxes = self._extract_layout_data( + category_ids, scores, bboxes = self._extract_layout_predictions( doc_id, pred_doc, coco_img_width, @@ -240,7 +471,7 @@ def export_predictions_wrt_original_COCO( json.dump(predictions, fd) return predictions - def _extract_layout_data( + def _extract_layout_predictions( self, doc_id: str, pred_doc: DoclingDocument, @@ -296,47 +527,77 @@ def _extract_layout_data( return category_ids, scores, bboxes -def main(args): +def main(): r""" """ - # Get args - docling_eval_path = Path(args.docling_eval_dir) - coco_path = Path(args.coco_dir) - save_path = Path(args.save_dir) + # Input parameters + parser = argparse.ArgumentParser() + parser.add_argument( + "-o", + "--operation", + required=True, + type=str, + help="Operation to perform. 
One of ['coco']", + ) + parser.add_argument( + "-s", + "--save_dir", + required=True, + type=Path, + help="Output directory to save files", + ) + parser.add_argument( + "-d", + "--docling_eval_dir", + required=True, + type=Path, + help="Root dir with the docling-eval parquet dataset with the predictions", + ) + parser.add_argument( + "-c", + "--coco_dir", + required=False, + type=Path, + help="Root dir of the COCO dataset", + ) + args = parser.parse_args() # Setup logger logging.getLogger("docling").setLevel(logging.WARNING) log_format = "%(asctime)s - %(levelname)s - %(message)s" logging.basicConfig(level=logging.INFO, format=log_format) - _log.info("Export eval-dataset in COCO-tools format") - _log.info("COCO dataset: %s", str(coco_path)) - _log.info("eval-dataset: %s", str(docling_eval_path)) - _log.info("Save path: %s", str(save_path)) + _log.info("Operation: %s", args.operation) + _log.info("eval-dataset: %s", str(args.docling_eval_dir)) + _log.info("Save path: %s", str(args.save_dir)) # Create the COCO exporter - exporter = DoclingEvalCOCOExporter(docling_eval_path) - exporter.export_predictions_wrt_original_COCO( - "test", - save_path, - coco_path, - DOCLING_LABELS_TO_COCO_CATEGORIES, - ) + exporter = DoclingEvalCOCOExporter(args.docling_eval_dir) + + # Run the operation + if args.operation.upper() == "COCO": + # Mapping from the parquet document label to the valid docling labels + doc_label_to_valid_label_mapping: dict[DocItemLabel, DocItemLabel] = { + DocItemLabel.PAGE_FOOTER: DocItemLabel.TEXT, + DocItemLabel.PAGE_HEADER: DocItemLabel.TEXT, + DocItemLabel.HANDWRITTEN_TEXT: DocItemLabel.PICTURE, + DocItemLabel.EMPTY_VALUE: DocItemLabel.KEY_VALUE_REGION, + DocItemLabel.PARAGRAPH: DocItemLabel.TEXT, + DocItemLabel.REFERENCE: DocItemLabel.TEXT, + } + exporter.export_COCO( + "test", + args.save_dir, + doc_label_to_valid_label_mapping, + ) + elif args.operation.upper() == "predictions": + exporter.export_predictions_wrt_original_COCO( + "test", + args.save_dir, 
+ args.coco_dir, + ) + else: + raise ValueError(f"Not supported operation: {args.operation}") if __name__ == "__main__": - # Input parameters - parser = argparse.ArgumentParser() - parser.add_argument( - "-d", - "--docling_eval_dir", - required=False, - help="Root dir with the docling-eval parquet dataset with the predictions", - ) - parser.add_argument( - "-c", "--coco_dir", required=False, help="Root dir of the COCO dataset" - ) - parser.add_argument( - "-s", "--save_dir", required=True, help="Output directory to save files" - ) - args = parser.parse_args() - main(args) + main() From e65aa3b6fcbcc29d540df926b92b1914e83a60de Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 15 Aug 2025 14:04:05 +0200 Subject: [PATCH 2/5] fix: Minor fix in COCO exporter Signed-off-by: Nikos Livathinos --- docling_eval/utils/coco_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling_eval/utils/coco_exporter.py b/docling_eval/utils/coco_exporter.py index a9bff893..b239906b 100644 --- a/docling_eval/utils/coco_exporter.py +++ b/docling_eval/utils/coco_exporter.py @@ -290,7 +290,7 @@ def _extract_layout_coco_annotations( "image_id": image_id, "category_id": category_id, "bbox": [bbox.l, bbox.t, bbox.width, bbox.height], - "iscrowed": 0, + "iscrowd": 0, "area": bbox.area(), "id": annotation_id, } From e63ab7316d9987c4ae66b99f7430404bbb81ce42 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 15 Aug 2025 14:47:16 +0200 Subject: [PATCH 3/5] fix: Fix handling of the annotation_id, image_id in COCO exporter Signed-off-by: Nikos Livathinos --- docling_eval/utils/coco_exporter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docling_eval/utils/coco_exporter.py b/docling_eval/utils/coco_exporter.py index b239906b..62ef517e 100644 --- a/docling_eval/utils/coco_exporter.py +++ b/docling_eval/utils/coco_exporter.py @@ -158,8 +158,8 @@ def export_COCO( anns: list[dict] = [] ds = self._load_ds(split) ds_selection = 
ds[split] - image_id = -1 - annotation_id = -1 + image_id = 0 + annotation_id = 0 for i, data in enumerate(ds_selection): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id @@ -224,8 +224,8 @@ def _extract_layout_coco_annotations( doc_anns: list[dict] = [] # Annotations of the document included_content_layers = {c for c in ContentLayer} - image_id = 1 + image_id_offset - annotation_id = 1 + annotation_id_offset + image_id = image_id_offset + annotation_id = annotation_id_offset for item, _ in doc.iterate_items( included_content_layers=included_content_layers ): From 5b23e3a60c403c213ef0eb032b461d9b9730d949 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 15 Aug 2025 16:42:10 +0200 Subject: [PATCH 4/5] fix: Fix mistake in coco_exporter Signed-off-by: Nikos Livathinos --- docling_eval/utils/coco_exporter.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/docling_eval/utils/coco_exporter.py b/docling_eval/utils/coco_exporter.py index 62ef517e..33e45f8b 100644 --- a/docling_eval/utils/coco_exporter.py +++ b/docling_eval/utils/coco_exporter.py @@ -163,6 +163,7 @@ def export_COCO( for i, data in enumerate(ds_selection): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id + if data_record.predicted_doc is not None and source_doc_column == "pred": doc = data_record.predicted_doc _log.info("Dataset document to export: 'predicted_doc'") @@ -247,7 +248,6 @@ def _extract_layout_coco_annotations( _log.error("Skip item without provenance: %s: %s", doc_id, label.value) continue - increase_image_id = False prov = item.prov[0] page_no = prov.page_no page = doc.pages[page_no] @@ -279,15 +279,14 @@ def _extract_layout_coco_annotations( "id": image_id, } ) - increase_image_id = True + image_id += 1 # Get the bbox in [x,y,w,h] COCO format - bbox: BoundingBox = prov.bbox.to_top_left_origin( - page_height=page_size.height - ) + bbox: BoundingBox = prov.bbox + bbox = 
bbox.to_top_left_origin(page_height=page_size.height) doc_anns.append( { - "image_id": image_id, + "image_id": image_id - 1, "category_id": category_id, "bbox": [bbox.l, bbox.t, bbox.width, bbox.height], "iscrowd": 0, @@ -295,11 +294,7 @@ def _extract_layout_coco_annotations( "id": annotation_id, } ) - - # Update the ids annotation_id += 1 - if increase_image_id: - image_id += 1 return doc_images, doc_anns, image_id, annotation_id From a3083e638313bc8ddae8f0908da58926ad15c3c9 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 15 Aug 2025 17:17:40 +0200 Subject: [PATCH 5/5] fix: Fix label mapping in COCO exporter Signed-off-by: Nikos Livathinos --- docling_eval/utils/coco_exporter.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/docling_eval/utils/coco_exporter.py b/docling_eval/utils/coco_exporter.py index 33e45f8b..53b16373 100644 --- a/docling_eval/utils/coco_exporter.py +++ b/docling_eval/utils/coco_exporter.py @@ -3,7 +3,7 @@ import json import logging from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple from datasets import Dataset, load_dataset from docling_core.types.doc.base import BoundingBox, Size @@ -120,7 +120,9 @@ def export_COCO( self, split: str, save_dir: Path, - doc_label_to_valid_label_mapping: dict[DocItemLabel, DocItemLabel], + extra_doc_label_to_valid_label_mapping: dict[ + DocItemLabel, Optional[DocItemLabel] + ], source_doc_column: str = "GT", ): r""" @@ -128,6 +130,8 @@ def export_COCO( ---------- save_dir: Location to save the exported COCO dataset split: COCO split to be created: One of ['train', 'test', 'val'] + extra_doc_label_to_valid_label_mapping: Extra mappings from docling document labels to valid docling labels. + If a mapping value is None, it means to ignore this key. source_doc_column: Which column from the parquet file should be used to generate the COCO dataset. It should be one of ["GT", "pred"].
By default "GT" """ @@ -142,9 +146,12 @@ def export_COCO( VALID_DOCLING_LABELS_TO_COCO_CATEGORIES.items() ) } - # Add the additional document labels which are not "valid" - for doc_label, valid_label in doc_label_to_valid_label_mapping.items(): - label_to_category_id[doc_label] = label_to_category_id[valid_label] + # Apply the corrections given in the extra_doc_label_to_valid_label_mapping + for doc_label, valid_label in extra_doc_label_to_valid_label_mapping.items(): + if valid_label is not None: + label_to_category_id[doc_label] = label_to_category_id[valid_label] + elif doc_label in label_to_category_id: + del label_to_category_id[doc_label] # Build the categories categories: list[dict] = self._build_categories() @@ -237,13 +244,13 @@ def _extract_layout_coco_annotations( # Skip label without mapping into a COCO categories if category_id == -1: - _log.error( + _log.warning( "Skip prediction with label that does not map to COCO categories: '%s'", label, ) continue - # TODO: Use only the first provenance of the item + # Use only the first provenance of the item if len(item.prov) == 0: _log.error("Skip item without provenance: %s: %s", doc_id, label.value) continue @@ -575,7 +582,8 @@ def main(): DocItemLabel.PAGE_FOOTER: DocItemLabel.TEXT, DocItemLabel.PAGE_HEADER: DocItemLabel.TEXT, DocItemLabel.HANDWRITTEN_TEXT: DocItemLabel.PICTURE, - DocItemLabel.EMPTY_VALUE: DocItemLabel.KEY_VALUE_REGION, + DocItemLabel.EMPTY_VALUE: None, + DocItemLabel.KEY_VALUE_REGION: None, DocItemLabel.PARAGRAPH: DocItemLabel.TEXT, DocItemLabel.REFERENCE: DocItemLabel.TEXT, }