@@ -248,21 +248,6 @@ def __call__(
248248 )
249249 visualizations_output_path .mkdir (parents = True , exist_ok = True )
250250
251- document_evaluations_map : Dict [str , DocumentEvaluationEntry ] = {}
252- if ocr_evaluation_report_path and ocr_evaluation_report_path .exists ():
253- with open (ocr_evaluation_report_path , "r" ) as report_file :
254- report_content : Dict [str , Any ] = json .load (report_file )
255- for eval_item_data in report_content .get ("evaluations" , []):
256- try :
257- doc_entry = DocumentEvaluationEntry .model_validate (
258- eval_item_data
259- )
260- document_evaluations_map [doc_entry .doc_id ] = doc_entry
261- except Exception as e_parse :
262- _log .warning (
263- f"Failed to parse document evaluation item: { eval_item_data } . Error: { e_parse } "
264- )
265-
266251 path_to_parquet_files : str = str (dataset_path / data_split_name / "*.parquet" )
267252 hf_dataset : Dataset = load_dataset (
268253 "parquet" , data_files = {data_split_name : path_to_parquet_files }
@@ -283,20 +268,6 @@ def __call__(
283268 BenchMarkColumns .GROUNDTRUTH_PAGE_IMAGES
284269 )
285270
286- page_image_bytes_list : List [Dict [str , bytes ]] = []
287- if isinstance (page_images_data , list ) and page_images_data :
288- if (
289- isinstance (page_images_data [0 ], dict )
290- and "bytes" in page_images_data [0 ]
291- ):
292- page_image_bytes_list = page_images_data
293-
294- if (
295- ocr_evaluation_report_path
296- and doc_id_val not in document_evaluations_map
297- ):
298- continue
299-
300271 ground_truth_segmented_pages : Dict [int , SegmentedPage ] = {}
301272 prediction_segmented_pages : Dict [int , SegmentedPage ] = {}
302273
@@ -316,16 +287,9 @@ def __call__(
316287 if parsed_pred_pages :
317288 prediction_segmented_pages = parsed_pred_pages
318289
319- if not page_image_bytes_list :
320- _log .warning (
321- f"No page images found for document { doc_id_val } . Skipping visualization."
322- )
323- continue
324-
325- image_raw_bytes : bytes = page_image_bytes_list [0 ]["bytes" ]
326- base_image : Image .Image = Image .open (BytesIO (image_raw_bytes )).convert (
327- "RGB"
328- )
290+ base_image : Image .Image = page_images_data [0 ]
291+ if base_image .mode != "RGB" :
292+ base_image = base_image .convert ("RGB" )
329293
330294 comparison_image : Image .Image = self ._render_ocr_comparison_on_image (
331295 doc_id_val ,
0 commit comments