diff --git a/docling_eval/campaign_tools/README_cvat_evaluation_pipeline.md b/docling_eval/campaign_tools/README_cvat_evaluation_pipeline.md index 6539a226..1ca02129 100644 --- a/docling_eval/campaign_tools/README_cvat_evaluation_pipeline.md +++ b/docling_eval/campaign_tools/README_cvat_evaluation_pipeline.md @@ -38,6 +38,7 @@ python cvat_evaluation_pipeline.py [OPTIONS] - `--pred-xml PATH`: Path to prediction CVAT XML file - `--step {gt,pred,eval,full}`: Pipeline step to run (default: full) - `--modalities {layout,document_structure}`: Evaluation modalities to run (default: both) +- `--strict`: Strict mode - require all images to have annotations in XML files (default: allow partial annotation batches) - `--verbose, -v`: Enable verbose logging ## Examples @@ -104,6 +105,25 @@ python cvat_evaluation_pipeline.py \ --modalities document_structure ``` +### 5. Strict Mode + +By default, the pipeline allows partial annotation batches where not all images need to have annotations in the XML file. This is useful when you have a large set of images but only a subset has been annotated. + +To enforce that ALL images must have annotations, use the `--strict` flag: + +```bash +python cvat_evaluation_pipeline.py \ + /path/to/images \ + /path/to/output \ + --gt-xml /path/to/complete_annotations.xml \ + --strict +``` + +In strict mode: +- The pipeline will fail with an error if any image lacks annotations +- Useful for validating complete annotation batches +- Helps catch missing annotations early in the process + ## Output Structure The pipeline creates the following directory structure in the output directory: diff --git a/docling_eval/campaign_tools/combine_cvat_evaluations.py b/docling_eval/campaign_tools/combine_cvat_evaluations.py index c6dd4034..c72154b2 100644 --- a/docling_eval/campaign_tools/combine_cvat_evaluations.py +++ b/docling_eval/campaign_tools/combine_cvat_evaluations.py @@ -8,9 +8,10 @@ * evaluation_CVAT_layout.json - layout-level metrics (`evaluations_per_image`) * evaluation_CVAT_document_structure.json - document-structure metrics (`evaluations`) +* evaluation_CVAT_key_value.json - key-value extraction metrics (`evaluations`) * file_name_user_id.csv - staff self-confidence / provenance table -The script matches the three sources by a **document id** that is derived from +The script matches the four sources by a **document id** that is derived from an image / doc name **without the file-extension** and we produde single table. Usage @@ -18,6 +19,7 @@ python combine_cvat_evaluations.py \ --layout_json evaluation_results/evaluation_CVAT_layout.json \ --docstruct_json evaluation_results/evaluation_CVAT_document_structure.json \ + --keyvalue_json evaluation_results/evaluation_CVAT_key_value.json \ --user_csv file_name_user_id.csv \ --out combined_evaluation.xlsx @@ -45,6 +47,20 @@ def _to_doc_id(path_like: str) -> str: return stem +def load_tables(json_path: Path) -> pd.DataFrame: + """Load evaluation_CVAT_tables.json and return a DataFrame.""" + with open(json_path, "r", encoding="utf-8") as fh: + data = json.load(fh) + if "evaluations" not in data: + raise KeyError( + "The supplied tables evaluation JSON does not contain the 'evaluations' field." + ) + df = pd.DataFrame(data["evaluations"]) + # The evaluator writes consistent doc_id (image stem). No further mapping needed. 
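+    # Cast to str so the doc_id merge key lines up with the other metric tables.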
+ df["doc_id"] = df["doc_id"].astype(str) + return df + + def load_layout(json_path: Path) -> pd.DataFrame: """Load *evaluation_CVAT_layout.json* and return a DataFrame.""" with open(json_path, "r", encoding="utf-8") as fh: @@ -103,6 +119,28 @@ def load_doc_structure(json_path: Path) -> pd.DataFrame: return df +def load_key_value(json_path: Path) -> pd.DataFrame: + """Load *evaluation_CVAT_key_value.json* and return a DataFrame.""" + with open(json_path, "r", encoding="utf-8") as fh: + data = json.load(fh) + + if "evaluations" not in data: + raise KeyError( + "The supplied key-value evaluation JSON does not contain the " + "'evaluations' field." + ) + + # Convert the evaluations list to a DataFrame + # The evaluations is a list of KeyValueEvaluation objects + evaluations_list = [] + for eval_data in data["evaluations"]: + evaluations_list.append(eval_data) + + df = pd.DataFrame(evaluations_list) + + return df + + def load_user_table(csv_path: Path) -> pd.DataFrame: """Load *file_name_user_id.csv* (staff provenance) and return a DataFrame.""" df = pd.read_csv(csv_path) @@ -130,15 +168,53 @@ def load_user_table(csv_path: Path) -> pd.DataFrame: def merge_tables( layout_df: pd.DataFrame, doc_df: pd.DataFrame, + keyvalue_df: Optional[pd.DataFrame] = None, user_df: Optional[pd.DataFrame] = None, + tables_df: Optional[pd.DataFrame] = None, ) -> pd.DataFrame: - """ - Merge layout, document structure, and optionally user dataframes. - If user_df is None, skip merging user columns. - """ df = layout_df.merge( doc_df[["doc_id", "edit_distance_struct"]], on="doc_id", how="outer" ) + + if keyvalue_df is not None: + df = df.merge( + keyvalue_df[ + [ + "doc_id", + "entity_f1", + "relation_f1", + "num_entity_diff", + "num_entity_diff_normalized", + "num_link_diff", + "num_link_diff_normalized", + ] + ], + on="doc_id", + how="left", + ) + + if tables_df is not None: + df = df.merge( + tables_df[ + [ + "doc_id", + "row_count_abs_diff_sum", + "col_count_abs_diff_sum", + "merge_count_abs_diff_sum", + "sem_body_f1", + "sem_row_section_f1", + "sem_row_header_f1", + "sem_col_header_f1", + "tables_unmatched", + "table_pairs", + "orphan_table_annotation_A", + "orphan_table_annotation_B", + ] + ], + on="doc_id", + how="left", + ) + if user_df is not None: df = df.merge( user_df[["doc_id", "annotator_id", "self_confidence", "image_name"]], @@ -146,47 +222,67 @@ def merge_tables( how="left", suffixes=("", "_user"), ) - # the self_confidence column numeric df["self_confidence"] = pd.to_numeric(df["self_confidence"], errors="coerce") - # confidence difference between the annotators df["diff_self_confidence"] = df.groupby("doc_id")["self_confidence"].transform( lambda x: x.max() - x.min() ) - # to check the self-confidence values - avg_self_confidence = df["self_confidence"].mean() - std_self_confidence = df["self_confidence"].std() - quantiles = df["self_confidence"].quantile([0.01, 0.5, 0.99]) - - print(f"Average self-confidence: {avg_self_confidence:.4f}") - print(f"Standard deviation: {std_self_confidence:.4f}") - print( - f"Quantiles (1%, 50%, 99%): {quantiles[0.01]:.4f}, {quantiles[0.5]:.4f}, {quantiles[0.99]:.4f}" - ) - - # we can re-order the most relevant columns towards the front. 
preferred_order = [ "doc_id", - "image_name", # from user table (includes extension) - "avg_weighted_label_matched_iou_50", # "value" in layout JSON file - "segmentation_f1", # may or may not exist, depending on evaluation config + "image_name", + "avg_weighted_label_matched_iou_50", + "segmentation_f1", "edit_distance_struct", + # table metrics (consolidated) + "row_count_abs_diff_sum", + "col_count_abs_diff_sum", + "merge_count_abs_diff_sum", + "sem_body_f1", + "sem_row_section_f1", + "sem_row_header_f1", + "sem_col_header_f1", + "tables_unmatched", + "table_pairs", + "orphan_table_annotation_A", + "orphan_table_annotation_B", + # key-values & misc + "entity_f1", + "relation_f1", "map_val", "annotator_id", "self_confidence", ] - ordered_cols = [c for c in preferred_order if c in df.columns] + [ + ordered = [c for c in preferred_order if c in df.columns] + [ c for c in df.columns if c not in preferred_order ] - df = df[ordered_cols] + df = df[ordered] - # filter columns to be present, and remove the rest filter_cols = [ "doc_id", "segmentation_f1", "segmentation_f1_no_pictures", "avg_weighted_label_matched_iou_50", "edit_distance_struct", + # consolidated table metrics + "row_count_abs_diff_sum", + "col_count_abs_diff_sum", + "merge_count_abs_diff_sum", + "sem_body_f1", + "sem_row_section_f1", + "sem_row_header_f1", + "sem_col_header_f1", + "tables_unmatched", + "table_pairs", + "orphan_table_annotation_A", + "orphan_table_annotation_B", + # key-values + "entity_f1", + "relation_f1", + "num_entity_diff", + "num_entity_diff_normalized", + "num_link_diff", + "num_link_diff_normalized", + # existing counts "annotator_id", "self_confidence", "diff_self_confidence", @@ -200,53 +296,68 @@ def merge_tables( "table_count_diff", "picture_count_diff", ] - df = df[[col for col in filter_cols if col in df.columns]] - + df = df[[c for c in filter_cols if c in df.columns]] return df def _write_as_excel_table(df: pd.DataFrame, path: Path) -> None: """ - Write *df* to *path* as an Excel **Table** and append five derived columns: - - layout_different → 1 if segmentation_f1_no_pictures < 0.9 else "" - structure_different→ 1 if edit_distance_struct ≥ 10 else "" - tables_different → 1 if table_count_diff ≥ 1 else "" - pictures_different → 1 if picture_count_diff ≥ 2 else "" - need_review → SUM of the four flags above (0-4) + Write *df* to *path* as an Excel **Table** and append derived columns: + + layout_different → 1 if segmentation or label-IoU is low + structure_different → 1 if edit distance is high + tables_different → 1 if table count differs + pictures_different → 1 if picture count differs + key_values_different → 1 if entity/relation F1 is low (and non-zero) + table_struct_different → 1 if any table structure count diffs sum to > 0 + table_semantic_different → 1 if table_pairs > 0 and any semantic F1 < 0.9 + need_review → SUM of selected flags (kept as in your current sheet) """ + from xlsxwriter.utility import xl_range + df = df.copy() - # append empty placeholders – the formulas will fill them extra_cols = [ "layout_different", "structure_different", "tables_different", "pictures_different", + "key_values_different", + "table_struct_different", + "table_semantic_different", "need_review", + "need_task2_review", ] for col in extra_cols: - df[col] = "" + if col not in df.columns: + df[col] = "" with pd.ExcelWriter(path, engine="xlsxwriter") as writer: - # 1️⃣ dump the data frame df.to_excel(writer, sheet_name="Evaluation", index=False) - # 2️⃣ grab handles we need - wb = writer.book ws = 
writer.sheets["Evaluation"] - n_rows, n_cols = df.shape # includes derived columns + n_rows, n_cols = df.shape - # 3️⃣ build column specs with formulas for the derived columns column_settings = [{"header": h} for h in df.columns] - # Tune these thresholds to your liking: col_formula = { - "layout_different": '=IF(OR([@[segmentation_f1_no_pictures]]<0.9, [@[avg_weighted_label_matched_iou_50]]<0.9),1,"")', + "layout_different": '=IF(OR([@[segmentation_f1_no_pictures]]<0.9,[@[avg_weighted_label_matched_iou_50]]<0.9),1,"")', "structure_different": '=IF([@[edit_distance_struct]]>=10,1,"")', "tables_different": '=IF([@[table_count_diff]]>=1,1,"")', "pictures_different": '=IF([@[picture_count_diff]]>=2,1,"")', + "key_values_different": '=IF(OR([@[num_entity_diff]]>2,[@[num_link_diff]]>2,AND([@[entity_f1]]<>0,[@[entity_f1]]<0.95),AND([@[relation_f1]]<>0,[@[relation_f1]]<0.95)),1,"")', + # 1) any structure count diffs sum to > 0 + "table_struct_different": ( + "=IF(SUM([@[row_count_abs_diff_sum]],[@[col_count_abs_diff_sum]]," + '[@[merge_count_abs_diff_sum]])>0,1,"")' + ), + # 2) gate semantics on table_pairs > 0, then flag if any F1 < 0.9 + "table_semantic_different": ( + "=IF(AND([@[table_pairs]]>0,OR([@[sem_col_header_f1]]<0.9," + '[@[sem_row_header_f1]]<0.9,[@[sem_row_section_f1]]<0.9,[@[sem_body_f1]]<0.9)),1,"")' + ), "need_review": "=SUM([@[layout_different]]:[@[pictures_different]])", + "need_task2_review": "=SUM([@[key_values_different]]:[@[table_semantic_different]],[@[tables_unmatched]])", } for spec in column_settings: @@ -254,27 +365,24 @@ def _write_as_excel_table(df: pd.DataFrame, path: Path) -> None: if hdr in col_formula: spec["formula"] = col_formula[hdr] - # 4️⃣ add the Excel Table (auto-filter, style, formulas copied down) - table_range = xl_range(0, 0, n_rows, n_cols - 1) # zero-based, incl. header + table_range = xl_range(0, 0, n_rows, n_cols - 1) ws.add_table( table_range, { "name": "EvaluationTbl", "header_row": True, "columns": column_settings, - # 👇 style must be a string such as "TableStyleMedium2" "style": "TableStyleMedium2", }, ) - # 5️⃣ quality-of-life tweaks ws.freeze_panes(1, 0) - ws.autofit() + ws.autofit() # type: ignore[attr-defined] def build_arg_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( - description="Combine CVAT layout & document-structure evaluation JSONs " + description="Combine CVAT layout, document-structure, and key-value evaluation JSONs " "with the staff provenance CSV into a single spreadsheet.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) @@ -290,6 +398,12 @@ def build_arg_parser() -> argparse.ArgumentParser: default=Path("evaluation_results/evaluation_CVAT_document_structure.json"), help="Path to evaluation_CVAT_document_structure.json", ) + p.add_argument( + "--keyvalue_json", + type=Path, + default=Path("evaluation_results/evaluation_CVAT_key_value.json"), + help="Path to evaluation_CVAT_key_value.json", + ) p.add_argument( "--user_csv", type=Path, @@ -312,15 +426,18 @@ def build_arg_parser() -> argparse.ArgumentParser: def combine_cvat_evaluations( layout_json: Path, docstruct_json: Path, + keyvalue_json: Optional[Path] = None, user_csv: Optional[Path] = None, + tables_json: Optional[Path] = None, out: Path = Path("combined_evaluation.xlsx"), ) -> pd.DataFrame: """ - Combine CVAT layout & document-structure evaluation JSONs with the staff provenance CSV into a single spreadsheet. + Combine CVAT layout, document-structure, and key-value evaluation JSONs with the staff provenance CSV into a single spreadsheet. 
Args: layout_json: Path to evaluation_CVAT_layout.json docstruct_json: Path to evaluation_CVAT_document_structure.json + keyvalue_json: Optional path to evaluation_CVAT_key_value.json user_csv: Optional path to file_name_user_id.csv out: Output file path; extension decides format (.xlsx for Excel, otherwise CSV) @@ -329,11 +446,16 @@ def combine_cvat_evaluations( """ layout_df = load_layout(layout_json) doc_df = load_doc_structure(docstruct_json) + tables_df = load_tables(tables_json) if tables_json is not None else None + + keyvalue_df: Optional[pd.DataFrame] = None + if keyvalue_json is not None: + keyvalue_df = load_key_value(keyvalue_json) user_df: Optional[pd.DataFrame] = None if user_csv is not None: user_df = load_user_table(user_csv) - combined_df = merge_tables(layout_df, doc_df, user_df) + combined_df = merge_tables(layout_df, doc_df, keyvalue_df, user_df, tables_df) if out.suffix.lower() == ".xlsx": # combined_df.to_excel(out, index=False) @@ -350,6 +472,7 @@ def main() -> None: combine_cvat_evaluations( layout_json=args.layout_json, docstruct_json=args.docstruct_json, + keyvalue_json=args.keyvalue_json, user_csv=args.user_csv, out=args.out, ) diff --git a/docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py b/docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py index 6d786277..8ee8b755 100644 --- a/docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py +++ b/docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py @@ -10,12 +10,13 @@ This is useful for preparing large-scale annotation tasks for CVAT or similar tools. Usage: - uv run python scratches/scratch_46.py --input-directory --output-directory [--sliding-window ] + uv run python docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py batch-prepare --input-directory --output-directory [--sliding-window ] [--use-predictions/--no-use-predictions] Arguments: input_directory: Root directory containing subdirectories with files to process output_directory: Where to store the generated datasets (one subdir per input subdir) sliding_window: Number of pages per CVAT task (default: 1) + use_predictions: Whether to create prediction dataset and use predictions in CVAT (default: True) """ from pathlib import Path @@ -30,7 +31,10 @@ def process_subdirectories( - input_directory: Path, output_directory: Path, sliding_window: int = 1 + input_directory: Path, + output_directory: Path, + sliding_window: int = 1, + use_predictions: bool = True, ) -> None: """ For each subdirectory in input_directory, create gt_dataset, eval_dataset, and cvat_dataset_preannotated @@ -40,6 +44,7 @@ def process_subdirectories( input_directory: Root directory with subdirectories to process output_directory: Where to store generated datasets sliding_window: Number of pages per CVAT task (default: 1) + use_predictions: Whether to create prediction dataset and use predictions in CVAT """ input_directory = input_directory.expanduser().resolve() output_directory = output_directory.expanduser().resolve() @@ -71,26 +76,33 @@ def process_subdirectories( else: typer.echo(f" GT dataset already exists, skipping.") - if not eval_dir.exists(): - typer.echo(f" Creating prediction dataset (Docling)...") - create_eval( - benchmark=BenchMarkNames.PLAIN_FILES, - output_dir=odir, - prediction_provider=PredictionProviderType.DOCLING, - do_visualization=True, - image_scale_factor=2.0, - do_table_structure=False, - ) + if use_predictions: + if not eval_dir.exists(): + typer.echo(f" Creating 
prediction dataset (Docling)...") + create_eval( + benchmark=BenchMarkNames.PLAIN_FILES, + output_dir=odir, + prediction_provider=PredictionProviderType.DOCLING, + do_visualization=True, + image_scale_factor=2.0, + do_table_structure=False, + ) + else: + typer.echo(f" Prediction dataset already exists, skipping.") else: - typer.echo(f" Prediction dataset already exists, skipping.") + typer.echo( + f" Skipping prediction dataset creation (use_predictions=False)." + ) if not cvat_dir.exists(): typer.echo(f" Creating CVAT pre-annotated dataset...") + # Use gt_dir when no predictions, eval_dir when using predictions + source_dir = (eval_dir / "test") if use_predictions else (gt_dir / "test") create_cvat( - gt_dir=eval_dir / "test", + gt_dir=source_dir, output_dir=cvat_dir, bucket_size=100, - use_predictions=True, + use_predictions=use_predictions, sliding_window=sliding_window, ) else: @@ -114,11 +126,16 @@ def batch_prepare( sliding_window: int = typer.Option( 1, help="Number of pages per CVAT task (default: 1)" ), + use_predictions: bool = typer.Option( + True, help="Whether to create prediction dataset and use predictions in CVAT" + ), ) -> None: """ Batch-create Docling evaluation datasets for all subdirectories in input_directory. """ - process_subdirectories(input_directory, output_directory, sliding_window) + process_subdirectories( + input_directory, output_directory, sliding_window, use_predictions + ) typer.echo("\nAll benchmarks created successfully!") diff --git a/docling_eval/campaign_tools/cvat_evaluation_pipeline.py b/docling_eval/campaign_tools/cvat_evaluation_pipeline.py index 600c2ac2..e5a573ef 100755 --- a/docling_eval/campaign_tools/cvat_evaluation_pipeline.py +++ b/docling_eval/campaign_tools/cvat_evaluation_pipeline.py @@ -20,9 +20,13 @@ from docling_eval.campaign_tools.combine_cvat_evaluations import ( combine_cvat_evaluations, ) +from docling_eval.campaign_tools.evaluate_cvat_tables import evaluate_tables from docling_eval.cli.main import evaluate from docling_eval.cvat_tools.cvat_to_docling import convert_cvat_to_docling -from docling_eval.cvat_tools.parser import MissingImageInCVATXML +from docling_eval.cvat_tools.parser import ( + MissingImageInCVATXML, + get_all_images_from_cvat_xml, +) from docling_eval.datamodels.types import ( BenchMarkNames, EvaluationModality, @@ -41,16 +45,18 @@ class CVATEvaluationPipeline: """Pipeline for CVAT annotation evaluation.""" - def __init__(self, images_dir: Path, output_dir: Path): + def __init__(self, images_dir: Path, output_dir: Path, strict: bool = False): """ Initialize the pipeline. 
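+        In non-strict mode, images without annotations in the supplied CVAT XML are skipped, so partially annotated batches can still be processed.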
Args: images_dir: Directory containing PNG image files output_dir: Base directory for all pipeline outputs + strict: If True, require all images to have annotations (default: False) """ self.images_dir = Path(images_dir) self.output_dir = Path(output_dir) + self.strict = strict # Create subdirectories self.gt_json_dir = self.output_dir / "ground_truth_json" @@ -93,9 +99,58 @@ def _convert_cvat_to_json( output_json_dir.mkdir(parents=True, exist_ok=True) json_files = [] + if self.strict: + _log.info("Running in STRICT mode: all images must have annotations") + else: + _log.info("Running in NORMAL mode: partial annotation batches allowed") + + # Get all images available in the directory image_files = self._find_image_files() - for image_path in image_files: + # Get all images that have annotations in the CVAT XML + try: + annotated_images = set(get_all_images_from_cvat_xml(cvat_xml_path)) + except Exception as e: + _log.error(f"Failed to read CVAT XML {cvat_xml_path}: {e}") + return [] + + # Filter to only process images that have annotations (unless in strict mode) + if self.strict: + # In strict mode, require all images to have annotations + missing_images = [ + img.name for img in image_files if img.name not in annotated_images + ] + if missing_images: + _log.error( + f"Strict mode: Found {len(missing_images)} images without annotations in {cvat_xml_path.name}" + ) + _log.error( + f"Missing annotations for: {', '.join(missing_images[:10])}" + + ("..." if len(missing_images) > 10 else "") + ) + raise ValueError( + f"Strict mode enabled: {len(missing_images)} images lack annotations in {cvat_xml_path.name}" + ) + images_to_process = image_files + _log.info( + f"Strict mode: All {len(image_files)} image files have annotations in {cvat_xml_path.name}" + ) + else: + # Normal mode: allow partial annotation batches + images_to_process = [ + img for img in image_files if img.name in annotated_images + ] + skipped_count = len(image_files) - len(images_to_process) + + _log.info( + f"Found {len(image_files)} image files, {len(images_to_process)} have annotations in {cvat_xml_path.name}" + ) + if skipped_count > 0: + _log.info( + f"Skipping {skipped_count} images without annotations (expected for partial annotation batches)" + ) + + for image_path in images_to_process: _log.info( f"Converting {image_path.name} with annotations from {cvat_xml_path.name}" ) @@ -122,11 +177,19 @@ def _convert_cvat_to_json( _log.warning(f"\u26a0 Failed to convert {image_path.name}") except MissingImageInCVATXML: - _log.warning( - f"Image {image_path.name} not found in {cvat_xml_path.name}. " - "This is expected for partial annotation batches. Skipping." - ) - continue + if self.strict: + # In strict mode, this is a fatal error + _log.error( + f"Strict mode: Image {image_path.name} not found in {cvat_xml_path.name}" + ) + raise + else: + # In normal mode, this should be unexpected due to pre-filtering + _log.error( + f"Unexpected: Image {image_path.name} was pre-filtered but not found in {cvat_xml_path.name}. " + "This suggests an issue with the filtering logic." 
+ ) + continue except ValueError as ve: _log.error(f"\u2717 Error processing {image_path.name}: {ve}") continue @@ -204,6 +267,42 @@ def create_prediction_dataset(self, pred_cvat_xml: Path) -> None: ) _log.info(f"✓ Prediction dataset created: {self.eval_dataset_dir}") + def run_table_evaluation( + self, + gt_cvat_xml: Path, + pred_cvat_xml: Path, + out_json: Optional[Path] = None, + containment_thresh: float = 0.50, + table_pair_iou: float = 0.20, + sem_match_iou: float = 0.30, + ) -> Path: + """ + Run the table structure/semantics evaluation directly on the two CVAT XMLs. + + Writes a JSON file (default: evaluation_results/evaluation_CVAT_tables.json) and returns its path. + """ + _log.info("=== Running Table Evaluation ===") + + if out_json is None: + out_json = self.evaluation_results_dir / "evaluation_CVAT_tables.json" + + self.evaluation_results_dir.mkdir(parents=True, exist_ok=True) + + result = evaluate_tables( + set_a=gt_cvat_xml, + set_b=pred_cvat_xml, + containment_thresh=containment_thresh, + table_pair_iou=table_pair_iou, + sem_match_iou=sem_match_iou, + ) + + out_json.write_text( + json.dumps(result.model_dump(mode="json"), ensure_ascii=False, indent=2), + encoding="utf-8", + ) + _log.info(f"✓ Tables evaluation written to: {out_json}") + return out_json + def run_evaluation( self, modalities: Optional[List[str]] = None, user_csv: Optional[Path] = None ) -> None: @@ -279,11 +378,15 @@ def run_evaluation( docstruct_json = ( self.evaluation_results_dir / "evaluation_CVAT_document_structure.json" ) + key_value_json = self.evaluation_results_dir / "evaluation_CVAT_key_value.json" + tables_json = self.evaluation_results_dir / "evaluation_CVAT_tables.json" _log.info(f"Combining evaluation results to {combined_out}") combine_cvat_evaluations( layout_json=layout_json, docstruct_json=docstruct_json, + keyvalue_json=key_value_json, user_csv=user_csv, + tables_json=tables_json, out=combined_out, ) @@ -309,6 +412,7 @@ def run_full_pipeline( try: self.create_ground_truth_dataset(gt_cvat_xml) self.create_prediction_dataset(pred_cvat_xml) + self.run_table_evaluation(gt_cvat_xml, pred_cvat_xml) self.run_evaluation(modalities, user_csv) # Combine results if user_csv is provided @@ -317,11 +421,18 @@ def run_full_pipeline( docstruct_json = ( self.evaluation_results_dir / "evaluation_CVAT_document_structure.json" ) + key_value_json = ( + self.evaluation_results_dir / "evaluation_CVAT_key_value.json" + ) + tables_json = self.evaluation_results_dir / "evaluation_CVAT_tables.json" + _log.info(f"Combining evaluation results to {combined_out}") combine_cvat_evaluations( layout_json=layout_json, docstruct_json=docstruct_json, + keyvalue_json=key_value_json, user_csv=user_csv, + tables_json=tables_json, out=combined_out, ) @@ -364,16 +475,16 @@ def main(): parser.add_argument( "--step", - choices=["gt", "pred", "eval", "full"], + choices=["gt", "pred", "tables", "eval", "full"], default="full", - help="Pipeline step to run: gt (ground truth), pred (predictions), eval (evaluation), full (all steps)", + help="Pipeline step to run: gt (ground truth), pred (predictions), tables (table eval only), eval, or full.", ) parser.add_argument( "--modalities", nargs="+", - choices=["layout", "document_structure"], - default=["layout", "document_structure"], + choices=["layout", "document_structure", "key_value"], + default=["layout", "document_structure", "key_value"], help="Evaluation modalities to run", ) @@ -381,6 +492,12 @@ def main(): "--verbose", "-v", action="store_true", help="Enable verbose logging" ) + 
parser.add_argument( + "--strict", + action="store_true", + help="Strict mode: require all images to have annotations in XML files (default: allow partial annotation batches)", + ) + args = parser.parse_args() if args.verbose: @@ -392,7 +509,9 @@ def main(): sys.exit(1) # Initialize pipeline - pipeline = CVATEvaluationPipeline(args.images_dir, args.output_dir) + pipeline = CVATEvaluationPipeline( + args.images_dir, args.output_dir, strict=args.strict + ) if args.step == "gt": if not args.gt_xml: @@ -411,7 +530,17 @@ def main(): _log.error(f"Prediction XML file does not exist: {args.pred_xml}") sys.exit(1) pipeline.create_prediction_dataset(args.pred_xml) - + elif args.step == "tables": + if not args.gt_xml or not args.pred_xml: + _log.error("Both --gt-xml and --pred-xml are required for tables step") + sys.exit(1) + if not args.gt_xml.exists(): + _log.error(f"Ground truth XML file does not exist: {args.gt_xml}") + sys.exit(1) + if not args.pred_xml.exists(): + _log.error(f"Prediction XML file does not exist: {args.pred_xml}") + sys.exit(1) + pipeline.run_table_evaluation(args.gt_xml, args.pred_xml) elif args.step == "eval": pipeline.run_evaluation(args.modalities, user_csv=args.user_csv) diff --git a/docling_eval/campaign_tools/evaluate_cvat_tables.py b/docling_eval/campaign_tools/evaluate_cvat_tables.py new file mode 100644 index 00000000..60f99d07 --- /dev/null +++ b/docling_eval/campaign_tools/evaluate_cvat_tables.py @@ -0,0 +1,401 @@ +from __future__ import annotations + +import json +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Tuple + +import typer +from docling_core.types.doc.base import BoundingBox +from docling_core.types.doc.labels import DocItemLabel +from pydantic import BaseModel, Field + +from docling_eval.cvat_tools.document import DocumentStructure +from docling_eval.cvat_tools.models import CVATElement, TableStructLabel + +DEFAULT_TABLE_PAIR_IOU: float = 0.20 +DEFAULT_CONTAINMENT_THRESH: float = 0.50 +DEFAULT_SEM_MATCH_IOU: float = 0.30 + + +def iou(a: BoundingBox, b: BoundingBox) -> float: + return a.intersection_over_union(b) + + +def inter_area(a: BoundingBox, b: BoundingBox) -> float: + return a.intersection_area_with(b) + + +def area(bb: BoundingBox) -> float: + return bb.area() + + +def inside_with_tolerance( + child: BoundingBox, parent: BoundingBox, thresh: float +) -> bool: + a = area(child) + if a <= 0.0: + return False + return (inter_area(child, parent) / a) >= thresh + + +class SemClass(str, Enum): + COL_HEADER = "col_header" + ROW_HEADER = "row_header" + ROW_SECTION = "row_section" + BODY = "body" + + +SEM_TO_TABLE_LABEL: dict[SemClass, TableStructLabel] = { + SemClass.COL_HEADER: TableStructLabel.COL_HEADER, + SemClass.ROW_HEADER: TableStructLabel.ROW_HEADER, + SemClass.ROW_SECTION: TableStructLabel.ROW_SECTION, + SemClass.BODY: TableStructLabel.BODY, +} + + +@dataclass +class TableStruct: + table_el: CVATElement + rows: list[CVATElement] + cols: list[CVATElement] + merges: list[CVATElement] + sem: dict[SemClass, list[CVATElement]] + + +class TablePairMetrics(BaseModel): + row_count_diff: int + col_count_diff: int + merge_count_diff: int + sem_f1: dict[SemClass, float] + + +class ImageTablesEvaluation(BaseModel): + # identifier used for joining in the combiner + doc_id: str + + # kept metrics + row_count_abs_diff_sum: int = 0 + col_count_abs_diff_sum: int = 0 + merge_count_abs_diff_sum: int = 0 + + sem_body_f1: float = 0.0 + sem_row_section_f1: float = 0.0 + 
sem_row_header_f1: float = 0.0 + sem_col_header_f1: float = 0.0 + + table_pairs: int = 0 + tables_unmatched: int = 0 + + orphan_table_annotation_A: int = 0 + orphan_table_annotation_B: int = 0 + + +class TablesEvaluationRun(BaseModel): + evaluations: list[ImageTablesEvaluation] = Field(default_factory=list) + + +def list_images_in_xml(xml_path: Path) -> list[str]: + import xml.etree.ElementTree as ET + + root = ET.parse(xml_path).getroot() + result: list[str] = [] + for img in root.findall(".//image"): + name = img.get("name") + if name: + result.append(name) + return result + + +def _elements_by_label( + elements: Sequence[CVATElement], label: object +) -> list[CVATElement]: + return [e for e in elements if e.label == label] + + +def _collect_tables( + doc: DocumentStructure, contain_thresh: float +) -> tuple[list[TableStruct], list[CVATElement]]: + tables = _elements_by_label(doc.elements, DocItemLabel.TABLE) + result: list[TableStruct] = [] + + pool_rows = _elements_by_label(doc.elements, TableStructLabel.TABLE_ROW) + pool_cols = _elements_by_label(doc.elements, TableStructLabel.TABLE_COLUMN) + pool_merges = _elements_by_label(doc.elements, TableStructLabel.TABLE_MERGED_CELL) + pool_sem: dict[SemClass, list[CVATElement]] = { + sc: _elements_by_label(doc.elements, lab) + for sc, lab in SEM_TO_TABLE_LABEL.items() + } + + for t in tables: + tb = t.bbox + rows = [ + e for e in pool_rows if inside_with_tolerance(e.bbox, tb, contain_thresh) + ] + cols = [ + e for e in pool_cols if inside_with_tolerance(e.bbox, tb, contain_thresh) + ] + merges = [ + e for e in pool_merges if inside_with_tolerance(e.bbox, tb, contain_thresh) + ] + sem = { + sc: [ + e + for e in pool_sem[sc] + if inside_with_tolerance(e.bbox, tb, contain_thresh) + ] + for sc in SemClass + } + result.append( + TableStruct(table_el=t, rows=rows, cols=cols, merges=merges, sem=sem) + ) + + all_tables_bb = [t.table_el.bbox for t in result] + + def not_in_any_table(el: CVATElement) -> bool: + return not any( + inside_with_tolerance(el.bbox, tb, contain_thresh) for tb in all_tables_bb + ) + + orphans = [ + e + for e in pool_rows + pool_cols + pool_merges + sum(pool_sem.values(), []) + if not_in_any_table(e) + ] + return result, orphans + + +def _pair_tables( + a: list[TableStruct], + b: list[TableStruct], + iou_thresh: float, +) -> tuple[list[tuple[TableStruct, TableStruct]], list[TableStruct], list[TableStruct]]: + if not a or not b: + return [], a[:], b[:] + + candidates: list[tuple[int, int, float]] = [] + for i, ta in enumerate(a): + for j, tb in enumerate(b): + candidates.append((i, j, iou(ta.table_el.bbox, tb.table_el.bbox))) + candidates.sort(key=lambda t: t[2], reverse=True) + + used_a: set[int] = set() + used_b: set[int] = set() + matched: list[tuple[TableStruct, TableStruct]] = [] + for i, j, s in candidates: + if s < iou_thresh: + break + if i in used_a or j in used_b: + continue + matched.append((a[i], b[j])) + used_a.add(i) + used_b.add(j) + + unmatched_a = [a[i] for i in range(len(a)) if i not in used_a] + unmatched_b = [b[j] for j in range(len(b)) if j not in used_b] + return matched, unmatched_a, unmatched_b + + +def _greedy_intersection_sum( + a: Sequence[BoundingBox], b: Sequence[BoundingBox], iou_thresh: float +) -> float: + if not a or not b: + return 0.0 + pairs: list[tuple[int, int, float]] = [] + for i, ba in enumerate(a): + for j, bb in enumerate(b): + v = iou(ba, bb) + if v >= iou_thresh: + pairs.append((i, j, v)) + pairs.sort(key=lambda t: t[2], reverse=True) + used_i: set[int] = set() + used_j: set[int] = 
set() + inter_sum = 0.0 + for i_idx, j_idx, _ in pairs: + if i_idx in used_i or j_idx in used_j: + continue + used_i.add(i_idx) + used_j.add(j_idx) + inter_sum += inter_area(a[i_idx], b[j_idx]) + return inter_sum + + +def _sem_f1( + a_boxes: list[BoundingBox], b_boxes: list[BoundingBox], iou_thresh: float +) -> float: + if not a_boxes and not b_boxes: + return 1.0 + if not a_boxes or not b_boxes: + return 0.0 + inter = _greedy_intersection_sum(a_boxes, b_boxes, iou_thresh=iou_thresh) + a_area = sum(area(bb) for bb in a_boxes) + b_area = sum(area(bb) for bb in b_boxes) + if a_area <= 0.0 or b_area <= 0.0: + return 0.0 + p = inter / a_area + r = inter / b_area + return 0.0 if (p + r) == 0.0 else (2.0 * p * r) / (p + r) + + +def _pair_metrics(ta: TableStruct, tb: TableStruct, sem_iou: float) -> TablePairMetrics: + sem_f1: dict[SemClass, float] = {} + for sc in SemClass: + a_boxes = [e.bbox for e in ta.sem.get(sc, [])] + b_boxes = [e.bbox for e in tb.sem.get(sc, [])] + sem_f1[sc] = _sem_f1(a_boxes, b_boxes, iou_thresh=sem_iou) + tpm = TablePairMetrics( + row_count_diff=abs(len(ta.rows) - len(tb.rows)), + col_count_diff=abs(len(ta.cols) - len(tb.cols)), + merge_count_diff=abs(len(ta.merges) - len(tb.merges)), + sem_f1=sem_f1, + ) + # print(f"Rows: A: {len(ta.rows)}, B: {len(tb.rows)}") + # print(f"Cols: A: {len(ta.cols)}, B: {len(tb.cols)}") + # print(f"Merges: A: {len(ta.merges)}, B: {len(tb.merges)}") + + return tpm + + +def _doc_id_from_image_name(image_name: str) -> str: + return Path(image_name).stem + + +def _orphans_count(orphans: list[CVATElement]) -> dict[str, int]: + out: dict[str, int] = { + "rows": 0, + "cols": 0, + "merges": 0, + "sem_col_header": 0, + "sem_row_header": 0, + "sem_row_section": 0, + "sem_body": 0, + } + for el in orphans: + if el.label == TableStructLabel.TABLE_ROW: + out["rows"] += 1 + elif el.label == TableStructLabel.TABLE_COLUMN: + out["cols"] += 1 + elif el.label == TableStructLabel.TABLE_MERGED_CELL: + out["merges"] += 1 + elif el.label == TableStructLabel.COL_HEADER: + out["sem_col_header"] += 1 + elif el.label == TableStructLabel.ROW_HEADER: + out["sem_row_header"] += 1 + elif el.label == TableStructLabel.ROW_SECTION: + out["sem_row_section"] += 1 + elif el.label == TableStructLabel.BODY: + out["sem_body"] += 1 + return out + + +def evaluate_image( + set_a_xml: Path, + set_b_xml: Path, + image_name: str, + containment_thresh: float, + table_pair_iou: float, + sem_match_iou: float, +) -> Optional[ImageTablesEvaluation]: + try: + doc_a = DocumentStructure.from_cvat_xml(set_a_xml, image_name) + doc_b = DocumentStructure.from_cvat_xml(set_b_xml, image_name) + except Exception: + return None + + tables_a, orphans_a = _collect_tables(doc_a, containment_thresh) + tables_b, orphans_b = _collect_tables(doc_b, containment_thresh) + + matched, ua, ub = _pair_tables(tables_a, tables_b, iou_thresh=table_pair_iou) + pair_metrics = [ + _pair_metrics(ta, tb, sem_iou=sem_match_iou) for (ta, tb) in matched + ] + + # Sums of absolute differences across matched table pairs + row_diff_sum = int(sum(pm.row_count_diff for pm in pair_metrics)) + col_diff_sum = int(sum(pm.col_count_diff for pm in pair_metrics)) + merge_diff_sum = int(sum(pm.merge_count_diff for pm in pair_metrics)) + + # Average semantic F1 over matched pairs (0 when no pairs) + def mean_f1(key: SemClass) -> float: + seq = [pm.sem_f1[key] for pm in pair_metrics] + return float(sum(seq)) / float(len(seq)) if seq else 0.0 + + return ImageTablesEvaluation( + doc_id=_doc_id_from_image_name(image_name), + 
row_count_abs_diff_sum=row_diff_sum, + col_count_abs_diff_sum=col_diff_sum, + merge_count_abs_diff_sum=merge_diff_sum, + sem_body_f1=mean_f1(SemClass.BODY), + sem_row_section_f1=mean_f1(SemClass.ROW_SECTION), + sem_row_header_f1=mean_f1(SemClass.ROW_HEADER), + sem_col_header_f1=mean_f1(SemClass.COL_HEADER), + table_pairs=len(matched), + tables_unmatched=(len(ua) + len(ub)), + orphan_table_annotation_A=len(orphans_a), + orphan_table_annotation_B=len(orphans_b), + ) + + +app = typer.Typer(help="Compare table structure/semantics between two CVAT XMLs.") + + +def evaluate_tables( + set_a: Path, + set_b: Path, + containment_thresh: float = DEFAULT_CONTAINMENT_THRESH, + table_pair_iou: float = DEFAULT_TABLE_PAIR_IOU, + sem_match_iou: float = DEFAULT_SEM_MATCH_IOU, +) -> "TablesEvaluationRun": + """ + Library entrypoint: evaluate tables across images present in both CVAT XMLs. + Returns the full evaluation model (no file I/O, no Typer types). + """ + imgs = sorted(set(list_images_in_xml(set_a)) & set(list_images_in_xml(set_b))) + evals: list[ImageTablesEvaluation] = [] + for name in imgs: + res = evaluate_image( + set_a_xml=set_a, + set_b_xml=set_b, + image_name=name, + containment_thresh=containment_thresh, + table_pair_iou=table_pair_iou, + sem_match_iou=sem_match_iou, + ) + if res is not None: + evals.append(res) + return TablesEvaluationRun(evaluations=evals) + + +@app.command() +def run( + set_a: Path = typer.Option( + ..., exists=True, readable=True, help="CVAT XML (Set A)" + ), + set_b: Path = typer.Option( + ..., exists=True, readable=True, help="CVAT XML (Set B)" + ), + out: Path = typer.Option( + Path("evaluation_results/evaluation_CVAT_tables.json"), help="Output JSON" + ), + containment_thresh: float = typer.Option( + DEFAULT_CONTAINMENT_THRESH, min=0.0, max=1.0 + ), + table_pair_iou: float = typer.Option(DEFAULT_TABLE_PAIR_IOU, min=0.0, max=1.0), + sem_match_iou: float = typer.Option(DEFAULT_SEM_MATCH_IOU, min=0.0, max=1.0), +) -> None: + result = evaluate_tables( + set_a=set_a, + set_b=set_b, + containment_thresh=containment_thresh, + table_pair_iou=table_pair_iou, + sem_match_iou=sem_match_iou, + ) + out.parent.mkdir(parents=True, exist_ok=True) + payload = result.model_dump(mode="json") + out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"Wrote {out.resolve()} ({len(result.evaluations)} images)") + + +if __name__ == "__main__": + app() diff --git a/docling_eval/cvat_tools/cvat_to_docling.py b/docling_eval/cvat_tools/cvat_to_docling.py index e910acd8..1ad15649 100644 --- a/docling_eval/cvat_tools/cvat_to_docling.py +++ b/docling_eval/cvat_tools/cvat_to_docling.py @@ -44,6 +44,7 @@ from docling_eval.cvat_tools.document import DocumentStructure from docling_eval.cvat_tools.models import CVATElement +from docling_eval.cvat_tools.parser import MissingImageInCVATXML from docling_eval.cvat_tools.tree import ( TreeNode, apply_reading_order_to_tree, @@ -750,7 +751,7 @@ def _create_item_by_label( parent: Optional[NodeItem], ) -> Optional[DocItem]: """Create appropriate DocItem based on element label.""" - content_layer = ContentLayer(element.content_layer.lower()) + content_layer = element.content_layer if doc_label == DocItemLabel.TITLE: return self.doc.add_title( @@ -1124,7 +1125,6 @@ def convert_cvat_to_docling( DoclingDocument or None if conversion fails """ try: - # Create DocumentStructure doc_structure = DocumentStructure.from_cvat_xml(xml_path, input_path.name) @@ -1199,6 +1199,9 @@ def convert_cvat_to_docling( # Convert return 
converter.convert() + except MissingImageInCVATXML: + # Re-raise so that calling code can handle with appropriate messaging + raise except Exception as e: _logger.error(f"Failed to convert CVAT to DoclingDocument: {e}") import traceback diff --git a/docling_eval/cvat_tools/models.py b/docling_eval/cvat_tools/models.py index 6e5d6031..cce71861 100644 --- a/docling_eval/cvat_tools/models.py +++ b/docling_eval/cvat_tools/models.py @@ -15,11 +15,21 @@ class ValidationSeverity(str, Enum): FATAL = "fatal" +class TableStructLabel(str, Enum): + TABLE_ROW = "table_row" + TABLE_COLUMN = "table_column" + TABLE_MERGED_CELL = "table_merged_cell" + COL_HEADER = "col_header" + ROW_HEADER = "row_header" + ROW_SECTION = "row_section" + BODY = "body" + + class CVATElement(BaseModel): """A rectangle element (box) in CVAT annotation, using BoundingBox from docling_core.""" id: int - label: Union[DocItemLabel, GraphCellLabel] + label: Union[DocItemLabel, GraphCellLabel, TableStructLabel] bbox: BoundingBox content_layer: ContentLayer type: Optional[str] = None diff --git a/docling_eval/cvat_tools/parser.py b/docling_eval/cvat_tools/parser.py index 0be616c3..413ccf22 100644 --- a/docling_eval/cvat_tools/parser.py +++ b/docling_eval/cvat_tools/parser.py @@ -12,6 +12,7 @@ CVATAnnotationPath, CVATElement, CVATImageInfo, + TableStructLabel, ) logger = logging.getLogger("docling_eval.cvat_tools.") @@ -76,58 +77,74 @@ def _parse_image_element( Parse a single element and extract elements and paths. Returns (elements, paths, image_info). """ + # Local import to avoid touching the module-level imports if you prefer + from docling_eval.cvat_tools.models import TableStructLabel + image_info = CVATImageInfo( width=float(image_el.attrib["width"]), height=float(image_el.attrib["height"]), name=image_el.attrib["name"], ) - elements = [] - paths = [] + + elements: list[CVATElement] = [] + paths: list[CVATAnnotationPath] = [] box_id = box_id_start path_id = path_id_start + + # ---- parse (rectangles) with strict labels (DocItemLabel | GraphCellLabel | TableStructLabel) for box in image_el.findall("box"): label_str = box.attrib["label"] + + # Parse into one of the known enums; skip if unknown + label_obj: Optional[object] = None try: - label = DocItemLabel(label_str) + label_obj = DocItemLabel(label_str) except ValueError: try: - label = GraphCellLabel(label_str) # type: ignore + label_obj = GraphCellLabel(label_str) # type: ignore[assignment] except ValueError: - # Skip invalid labels - logger.debug(f"Skipping invalid label: {label_str}") - continue + try: + label_obj = TableStructLabel(label_str) # type: ignore[assignment] + except ValueError: + logger.debug(f"Skipping invalid label: {label_str}") + continue + xtl = float(box.attrib["xtl"]) ytl = float(box.attrib["ytl"]) xbr = float(box.attrib["xbr"]) ybr = float(box.attrib["ybr"]) - bbox = cvat_box_to_bbox(xtl, ytl, xbr, ybr) - attributes = {} + bbox = cvat_box_to_bbox(xtl, ytl, xbr, ybr) # -> BoundingBox(l,t,r,b) TOPLEFT + + # Parse child tags; default content_layer to BODY + attributes: dict[str, str | None] = {} content_layer = None - type_ = None - level = None + type_: Optional[str] = None + level: Optional[int] = None + for attr in box.findall("attribute"): name = attr.attrib["name"] value = attr.text.strip() if attr.text else None attributes[name] = value if name == "content_layer" and value is not None: try: - content_layer = ContentLayer(value) + content_layer = ContentLayer(value.lower()) except Exception: content_layer = ContentLayer.BODY elif name == "type": type_ 
= value - elif name == "level": - if value is not None: - try: - level = int(value) - except Exception: - level = None + elif name == "level" and value is not None: + try: + level = int(value) + except Exception: + level = None + if content_layer is None: content_layer = ContentLayer.BODY + elements.append( CVATElement( id=box_id, - label=label, + label=label_obj, # Union[DocItemLabel, GraphCellLabel, TableStructLabel] bbox=bbox, content_layer=content_layer, type=type_, @@ -136,12 +153,15 @@ def _parse_image_element( ) ) box_id += 1 + + # ---- parse (paths) for poly in image_el.findall("polyline"): poly_label = poly.attrib["label"] points_str = poly.attrib["points"] points = [tuple(map(float, pt.split(","))) for pt in points_str.split(";")] - attributes = {} - level = None + + attributes: dict[str, str | None] = {} # type: ignore + level: Optional[int] = None # type: ignore for attr in poly.findall("attribute"): name = attr.attrib["name"] value = attr.text.strip() if attr.text else None @@ -151,6 +171,7 @@ def _parse_image_element( level = int(value) except Exception: level = None + paths.append( CVATAnnotationPath( id=path_id, @@ -161,6 +182,7 @@ def _parse_image_element( ) ) path_id += 1 + return elements, paths, image_info diff --git a/docling_eval/cvat_tools/validator.py b/docling_eval/cvat_tools/validator.py index a8d3bfe8..c785102c 100644 --- a/docling_eval/cvat_tools/validator.py +++ b/docling_eval/cvat_tools/validator.py @@ -3,6 +3,8 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Set, Type +from docling_core.types.doc.document import ContentLayer + from .document import DocumentStructure from .models import ( CVATElement, @@ -176,7 +178,7 @@ def validate(self, doc: DocumentStructure) -> List[CVATValidationError]: # Collect all elements that would fail the reading order validation untouched_elements = [] for el in doc.elements: - if el.content_layer.upper() == "BACKGROUND": + if el.content_layer == ContentLayer.BACKGROUND: continue # Skip validation for elements inside table containers
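For reference, a minimal sketch of driving the new entrypoints from Python rather than the CLI, assuming the layout, document-structure, and key-value JSONs were already produced by an earlier evaluation run; every path below is a placeholder:

```python
import json
from pathlib import Path

from docling_eval.campaign_tools.combine_cvat_evaluations import combine_cvat_evaluations
from docling_eval.campaign_tools.evaluate_cvat_tables import evaluate_tables

results_dir = Path("evaluation_results")  # placeholder output directory

# Compare table structure/semantics between two CVAT XML annotation sets.
run = evaluate_tables(
    set_a=Path("gt_annotations.xml"),    # placeholder
    set_b=Path("pred_annotations.xml"),  # placeholder
)
tables_json = results_dir / "evaluation_CVAT_tables.json"
tables_json.parent.mkdir(parents=True, exist_ok=True)
tables_json.write_text(
    json.dumps(run.model_dump(mode="json"), ensure_ascii=False, indent=2),
    encoding="utf-8",
)

# Merge layout, document-structure, key-value, and table metrics into one sheet.
combine_cvat_evaluations(
    layout_json=results_dir / "evaluation_CVAT_layout.json",
    docstruct_json=results_dir / "evaluation_CVAT_document_structure.json",
    keyvalue_json=results_dir / "evaluation_CVAT_key_value.json",
    tables_json=tables_json,
    user_csv=None,
    out=Path("combined_evaluation.xlsx"),
)
```

This mirrors what `run_full_pipeline` does internally via `run_table_evaluation` followed by `combine_cvat_evaluations`.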