diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
index a611cdda..3972f3af 100644
--- a/docling_eval/cli/main.py
+++ b/docling_eval/cli/main.py
@@ -126,6 +126,7 @@
 from docling_eval.prediction_providers.tableformer_provider import (
     TableFormerPredictionProvider,
 )
+from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer
 
 
 class DoclingLayoutOptionsManager:
@@ -362,7 +363,7 @@ def get_prediction_provider(
     docling_layout_keep_empty_clusters: Optional[bool] = None,
     # Controls orphan text cells only for the programmatic Docling pipeline (PDF_DOCLING)
     docling_programmatic_add_orphan_text_cells: Optional[bool] = None,
-    docling_force_full_page_ocr: Optional[bool] = None,
+    docling_force_full_page_ocr: bool = False,
     granite_docling_vlm_options: Optional[InlineVlmOptions] = None,
     max_new_tokens: Optional[int] = None,
 ):
@@ -376,7 +377,7 @@ def get_prediction_provider(
     ocr_factory = get_ocr_factory()
 
     ocr_options: OcrOptions = ocr_factory.create_options(  # type: ignore
-        kind="easyocr",
+        kind="rapidocr",
         force_full_page_ocr=docling_force_full_page_ocr,
     )
     # Use all CPU cores
@@ -1578,6 +1579,67 @@ def visualize_cmd(
     )
 
 
+@app.command(name="create_viz")
+def create_viz(
+    dataset_dir: Annotated[
+        Path,
+        typer.Option(
+            help=(
+                "Dataset directory (GT parquet or eval_dataset parquet with predictions) "
+                "containing the split folder with parquet shards."
+            )
+        ),
+    ],
+    split: Annotated[str, typer.Option(help="Dataset split to visualize")] = "test",
+    external_predictions_path: Annotated[
+        Optional[Path],
+        typer.Option(
+            help=(
+                "Directory with DoclingDocument predictions named as <doc_id>.[json|dt|yaml|yml]. "
+                "If omitted, predictions are taken from the dataset parquet."
+            )
+        ),
+    ] = None,
+    output_dir: Annotated[
+        Optional[Path],
+        typer.Option(
+            help=(
+                "Directory where HTML visualizations are written. Defaults to "
+                "<dataset_dir>/visualizations when omitted."
+            )
+        ),
+    ] = None,
+    begin_index: Annotated[int, typer.Option(help="Begin index (inclusive)")] = 0,
+    end_index: Annotated[
+        int, typer.Option(help="End index (exclusive), -1 for all")
+    ] = -1,
+    ignore_missing_predictions: Annotated[
+        bool,
+        typer.Option(
+            help="Skip documents without a matching prediction instead of failing"
+        ),
+    ] = False,
+):
+    """
+    Create paired GT vs. prediction HTML visualizations without generating parquet output.
+    """
+    visualizations_dir = (
+        output_dir if output_dir is not None else dataset_dir / "visualizations"
+    )
+
+    visualizer = PredictionsVisualizer(
+        visualizations_dir=visualizations_dir,
+        external_predictions_dir=external_predictions_path,
+        ignore_missing_predictions=ignore_missing_predictions,
+    )
+    visualizer.create_visualizations(
+        dataset_dir=dataset_dir,
+        split=split,
+        begin_index=begin_index,
+        end_index=end_index,
+    )
+
+
 @app.callback()
 def main():
     """Docling Evaluation CLI for benchmarking document processing tasks."""
diff --git a/docling_eval/utils/external_predictions_visualizer.py b/docling_eval/utils/external_predictions_visualizer.py
new file mode 100644
index 00000000..35342c98
--- /dev/null
+++ b/docling_eval/utils/external_predictions_visualizer.py
@@ -0,0 +1,163 @@
+import logging
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from datasets import Dataset, load_dataset
+from docling.datamodel.base_models import ConversionStatus
+from docling_core.types.doc.document import DoclingDocument
+from PIL import Image
+from tqdm import tqdm  # type: ignore
+
+from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction
+from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats
+from docling_eval.utils.external_docling_document_loader import (
+    ExternalDoclingDocumentLoader,
+)
+from docling_eval.utils.utils import extract_images, insert_images_from_pil
+from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class PredictionsVisualizer:
+    """
+    Render ground-truth vs. prediction visualizations for an existing dataset.
+
+    Works with either:
+    - A dataset that already embeds predictions (DatasetRecordWithPrediction parquet)
+    - A ground-truth-only dataset paired with an external predictions directory
+      containing DoclingDocument files named <doc_id>.[json|dt|yaml|yml]
+    """
+
+    def __init__(
+        self,
+        visualizations_dir: Path,
+        *,
+        external_predictions_dir: Optional[Path] = None,
+        ignore_missing_predictions: bool = False,
+    ):
+        self._loader = (
+            ExternalDoclingDocumentLoader(external_predictions_dir)
+            if external_predictions_dir is not None
+            else None
+        )
+        self._visualizations_dir = visualizations_dir
+        self._ignore_missing_predictions = ignore_missing_predictions
+
+    def create_visualizations(
+        self,
+        dataset_dir: Path,
+        split: str = "test",
+        begin_index: int = 0,
+        end_index: int = -1,
+    ) -> None:
+        """
+        Generate paired HTML visualizations between ground truth and predictions.
+        """
+        dataset = self._load_split(dataset_dir, split)
+        dataset = self._slice_dataset(dataset, begin_index, end_index)
+        self._visualizations_dir.mkdir(parents=True, exist_ok=True)
+
+        for _, row in tqdm(
+            enumerate(dataset),
+            desc="Rendering visualizations",
+            total=len(dataset),
+            ncols=120,
+        ):
+            record = DatasetRecordWithPrediction.model_validate(row)
+            pred_doc = self._resolve_prediction_document(record)
+            if pred_doc is None:
+                message = f"Missing prediction for document {record.doc_id}"
+                if self._ignore_missing_predictions:
+                    _LOGGER.warning(message)
+                    continue
+                raise FileNotFoundError(message)
+
+            pred_doc, pred_pictures, pred_page_images = self._prepare_prediction_assets(
+                record, pred_doc
+            )
+
+            record_for_viz = record.model_copy(deep=True)
+            record_for_viz.predicted_doc = pred_doc
+            record_for_viz.predicted_pictures = pred_pictures
+            record_for_viz.predicted_page_images = pred_page_images
+            record_for_viz.prediction_format = PredictionFormats.DOCLING_DOCUMENT
+            record_for_viz.status = ConversionStatus.SUCCESS
+
+            self._save_visualization(record_for_viz)
+
+    def _resolve_prediction_document(
+        self, record: DatasetRecordWithPrediction
+    ) -> Optional[DoclingDocument]:
+        if self._loader is not None:
+            return self._loader(record)
+        return record.predicted_doc
+
+    def _prepare_prediction_assets(
+        self, record: DatasetRecordWithPrediction, pred_doc: DoclingDocument
+    ) -> Tuple[DoclingDocument, List[Image.Image], List[Image.Image]]:
+        if self._loader is None and (
+            record.predicted_pictures or record.predicted_page_images
+        ):
+            return (
+                pred_doc.model_copy(deep=True),
+                list(record.predicted_pictures),
+                list(record.predicted_page_images),
+            )
+
+        prepared_doc, pred_pictures, pred_page_images = extract_images(
+            document=pred_doc.model_copy(deep=True),
+            pictures_column=BenchMarkColumns.PREDICTION_PICTURES.value,
+            page_images_column=BenchMarkColumns.PREDICTION_PAGE_IMAGES.value,
+        )
+        return prepared_doc, pred_pictures, pred_page_images
+
+    def _load_split(self, dataset_dir: Path, split: str) -> Dataset:
+        split_dir = dataset_dir / split
+        split_files = sorted(split_dir.glob("*.parquet"))
+        if not split_files:
+            raise FileNotFoundError(f"No parquet files found under {split_dir}")
+        dataset = load_dataset(
+            "parquet", data_files={split: [str(path) for path in split_files]}
+        )
+        return dataset[split]
+
+    def _slice_dataset(
+        self, dataset: Dataset, begin_index: int, end_index: int
+    ) -> Dataset:
+        total = len(dataset)
+        begin = max(begin_index, 0)
+        end = total if end_index < 0 else min(end_index, total)
+
+        if begin >= end:
+            return dataset.select([])
+        if begin == 0 and end == total:
+            return dataset
+        return dataset.select(range(begin, end))
+
+    def _save_visualization(self, record: DatasetRecordWithPrediction) -> None:
+        if record.predicted_doc is None:
+            return
+
+        gt_doc = insert_images_from_pil(
+            record.ground_truth_doc.model_copy(deep=True),
+            record.ground_truth_pictures,
+            record.ground_truth_page_images,
+        )
+        pred_doc = insert_images_from_pil(
+            record.predicted_doc.model_copy(deep=True),
+            record.predicted_pictures,
+            record.predicted_page_images,
+        )
+
+        try:
+            save_comparison_html_with_clusters(
+                filename=self._visualizations_dir / f"{record.doc_id}.html",
+                true_doc=gt_doc,
+                pred_doc=pred_doc,
+                draw_reading_order=True,
+            )
+        except (IndexError, ValueError) as e:
+            _LOGGER.warning(
+                f"Failed to save visualization for doc_id {record.doc_id}: {e}"
+            )
diff --git a/docs/examples/evaluate_dpbench_on_external_predictions.sh b/docs/examples/evaluate_dpbench_on_external_predictions.sh
index 413b12de..6189e90b 100755
--- a/docs/examples/evaluate_dpbench_on_external_predictions.sh
+++ b/docs/examples/evaluate_dpbench_on_external_predictions.sh
@@ -38,7 +38,7 @@ evaluate() {
     fi
 
     for modality in "${MODALITIES[@]}"; do
-        echo "Evaluation modality: ${modality}, predictions: ${pred_dir}"
+        echo "Evaluate: modality: ${modality}: predictions: ${pred_dir}"
         uv run docling-eval evaluate \
             --benchmark DPBench \
             --modality "${modality}" \
@@ -49,24 +49,54 @@ evaluate() {
 }
 
 
 
+visualize() {
+    local pred_dir save_dir modality
+    pred_dir="$1"
+    save_dir="$2"
+
+    # Check if the GT/preds dirs exist
+    if [ ! -d "${GT_DIR}" ]; then
+        echo "Missing GT dir: ${GT_DIR}"
+        exit 1
+    fi
+    if [ ! -d "${pred_dir}" ]; then
+        echo "Missing predictions dir: ${pred_dir}"
+        exit 2
+    fi
+
+    echo "Visualize predictions: ${pred_dir}"
+    uv run docling-eval create_viz \
+        --dataset-dir "${GT_DIR}" \
+        --external-predictions-path "${pred_dir}" \
+        --output-dir "${save_dir}"
+}
+
 ###########################################################################################
 # Main
 #
 
+# Predictions
+
 # json predictions
 evaluate \
     scratch/DPBench/predicted_documents/json \
-    scratch/DPBench/external_evaluations_jsons
+    scratch/DPBench/external_predictions_jsons
 
 # doctags predictions
 evaluate \
     scratch/DPBench/predicted_documents/doctag \
-    scratch/DPBench/external_evaluations_doctags
+    scratch/DPBench/external_predictions_doctags
 
 # yaml predictions
 evaluate \
     scratch/DPBench/predicted_documents/yaml \
-    scratch/DPBench/external_evaluations_yaml
+    scratch/DPBench/external_predictions_yaml
+
+
+# Visualisations
+visualize \
+    scratch/DPBench/predicted_documents/json \
+    scratch/DPBench/external_predictions_visualisations
 
 
diff --git a/tests/test_predictions_visualizer.py b/tests/test_predictions_visualizer.py
new file mode 100644
index 00000000..cdce9a01
--- /dev/null
+++ b/tests/test_predictions_visualizer.py
@@ -0,0 +1,64 @@
+from pathlib import Path
+
+import pytest
+from datasets import load_dataset
+
+from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction
+from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer
+
+
+def _first_doc_id(parquet_root: Path) -> str:
+    split_files = sorted((parquet_root / "test").glob("*.parquet"))
+    ds = load_dataset(
+        "parquet", data_files={"test": [str(path) for path in split_files]}
+    )
+    record = DatasetRecordWithPrediction.model_validate(ds["test"][0])
+    return record.doc_id
+
+
+@pytest.mark.dependency(
+    depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"],
+    scope="session",
+)
+def test_predictions_visualizer_with_embedded_predictions() -> None:
+    dataset_dir = Path("scratch/DPBench/eval_dataset_e2e")
+    output_dir = Path("scratch/DPBench/visualizer_tests/embedded")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    visualizer = PredictionsVisualizer(visualizations_dir=output_dir)
+    visualizer.create_visualizations(
+        dataset_dir=dataset_dir,
+        split="test",
+        begin_index=0,
+        end_index=1,
+    )
+
+    doc_id = _first_doc_id(dataset_dir)
+    layout_file = output_dir / f"{doc_id}_layout.html"
+    assert layout_file.is_file()
+
+
+@pytest.mark.dependency(
+    depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"],
+    scope="session",
+)
+def test_predictions_visualizer_with_external_predictions() -> None:
+    gt_dir = Path("scratch/DPBench/gt_dataset")
+    external_predictions_dir = Path("scratch/DPBench/predicted_documents/json")
+    output_dir = Path("scratch/DPBench/visualizer_tests/external")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    visualizer = PredictionsVisualizer(
+        visualizations_dir=output_dir,
+        external_predictions_dir=external_predictions_dir,
+    )
+    visualizer.create_visualizations(
+        dataset_dir=gt_dir,
+        split="test",
+        begin_index=0,
+        end_index=1,
+    )
+
+    doc_id = _first_doc_id(gt_dir)
+    layout_file = output_dir / f"{doc_id}_layout.html"
+    assert layout_file.is_file()