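"""Compare OCR processing modes ("individual_blocks" vs "entire_page") for
unstructured_inference on a set of sample documents, writing a JSON report
of timing and text-similarity metrics for each file."""
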
import json
import os
import time
from datetime import datetime
from difflib import SequenceMatcher

import nltk
import pdf2image

from unstructured_inference.inference.layout import (
    DocumentLayout,
    create_image_output_dir,
    process_file_with_model,
)

# Download the required resources (run this once)
nltk.download("punkt")


def validate_performance(
    f_name,
    validation_mode,
    is_image_file=False,
):
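    """Run layout detection and OCR on one file in both OCR modes, then write
    a JSON report comparing processing time and extracted text."""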
    print(f">>> Start performance comparison - filename: {f_name} - validation_mode: {validation_mode}"
          f" - is_image_file: {is_image_file}")

    now_dt = datetime.utcnow()
    now_str = now_dt.strftime("%Y_%m_%d-%H_%M_%S")

    f_path = os.path.join(example_docs_dir, f_name)

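    # Determine the page count; in image mode, also collect the per-page image
    # files (converting the PDF to images when the input is not already an image).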
    image_f_paths = []
    if validation_mode == "pdf":
        pdf_info = pdf2image.pdfinfo_from_path(f_path)
        n_pages = pdf_info["Pages"]
    elif validation_mode == "image":
        if is_image_file:
            image_f_paths.append(f_path)
        else:
            image_output_dir = create_image_output_dir(f_path)
            images = pdf2image.convert_from_path(f_path, output_folder=image_output_dir)
            image_f_paths = [image.filename for image in images]
        n_pages = len(image_f_paths)
    else:
        n_pages = 0

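    # Process the document once per OCR mode, recording inference time and the
    # extracted text for comparison.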
    processing_result = {}
    for ocr_mode in ["individual_blocks", "entire_page"]:
        start_time = time.time()

        if validation_mode == "pdf":
            layout = process_file_with_model(
                f_path,
                model_name=None,
                ocr_mode=ocr_mode,
            )
        elif validation_mode == "image":
            pages = []
            for image_f_path in image_f_paths:
                _layout = process_file_with_model(
                    image_f_path,
                    model_name=None,
                    ocr_mode=ocr_mode,
                    is_image=True,
                )
                pages += _layout.pages
            for i, page in enumerate(pages):
                page.number = i + 1
            layout = DocumentLayout.from_pages(pages)
        else:
            layout = None

        infer_time = time.time() - start_time

        if layout is None:
            print("Layout is None")
            return

        full_text = str(layout)
        page_text = {}
        for page in layout.pages:
            page_text[page.number] = str(page)

        processing_result[ocr_mode] = {
            "infer_time": infer_time,
            "full_text": full_text,
            "page_text": page_text,
        }

    individual_mode_page_text = processing_result["individual_blocks"]["page_text"]
    entire_mode_page_text = processing_result["entire_page"]["page_text"]
    individual_mode_full_text = processing_result["individual_blocks"]["full_text"]
    entire_mode_full_text = processing_result["entire_page"]["full_text"]

    compare_result = compare_processed_text(individual_mode_full_text, entire_mode_full_text)

    report = {
        "validation_mode": validation_mode,
        "file_info": {
            "filename": f_name,
            "n_pages": n_pages,
        },
        "processing_time": {
            "individual_blocks": processing_result["individual_blocks"]["infer_time"],
            "entire_page": processing_result["entire_page"]["infer_time"],
        },
        "text_similarity": compare_result,
        "extracted_text": {
            "individual_blocks": {
                "page_text": individual_mode_page_text,
                "full_text": individual_mode_full_text,
            },
            "entire_page": {
                "page_text": entire_mode_page_text,
                "full_text": entire_mode_full_text,
            },
        },
    }

    write_report(report, now_str, validation_mode)

    print("<<< End performance comparison", f_name)


def compare_processed_text(individual_mode_full_text, entire_mode_full_text, delimiter=" "):
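    """Compare the text extracted by the two OCR modes: report a SequenceMatcher
    similarity ratio, word and unique-word counts, and the words unique to each mode."""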
    # Calculate similarity ratio
    similarity_ratio = SequenceMatcher(None, individual_mode_full_text, entire_mode_full_text).ratio()

    print(f"similarity_ratio: {similarity_ratio}")

    # Tokenize the text into words
    word_list_individual = nltk.word_tokenize(individual_mode_full_text)
    n_word_list_individual = len(word_list_individual)
    print("n_word_list_individual:", n_word_list_individual)
    word_sets_individual = set(word_list_individual)
    n_word_sets_individual = len(word_sets_individual)
    print(f"n_word_sets_individual: {n_word_sets_individual}")
    # print("word_sets_individual:", word_sets_individual)

    word_list_entire = nltk.word_tokenize(entire_mode_full_text)
    n_word_list_entire = len(word_list_entire)
    print("n_word_list_entire:", n_word_list_entire)
    word_sets_entire = set(word_list_entire)
    n_word_sets_entire = len(word_sets_entire)
    print(f"n_word_sets_entire: {n_word_sets_entire}")
    # print("word_sets_entire:", word_sets_entire)

    # Find unique elements using set difference
    print("diff_elements:")
    unique_words_individual = word_sets_individual - word_sets_entire
    unique_words_entire = word_sets_entire - word_sets_individual
    print(f"unique_words_in_text_individual: {unique_words_individual}\n")
    print(f"unique_words_in_text_entire: {unique_words_entire}")

    return {
        "similarity_ratio": similarity_ratio,
        "individual_blocks": {
            "n_word_list": n_word_list_individual,
            "n_word_sets": n_word_sets_individual,
            # sorted for a deterministic report (set iteration order is arbitrary)
            "unique_words": delimiter.join(sorted(unique_words_individual)),
        },
        "entire_page": {
            "n_word_list": n_word_list_entire,
            "n_word_sets": n_word_sets_entire,
            "unique_words": delimiter.join(sorted(unique_words_entire)),
        },
    }


def write_report(report, now_str, validation_mode):
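    """Write the comparison report as JSON into the output directory."""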
    report_f_name = f"validate-ocr-{validation_mode}-{now_str}.json"
    report_f_path = os.path.join(output_dir, report_f_name)
    with open(report_f_path, "w", encoding="utf-8-sig") as f:
        json.dump(report, f, indent=4)


def run():
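    """Run the OCR-mode comparison for each sample document."""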
    test_files = [
        {"name": "layout-parser-paper-fast.pdf", "mode": "image", "is_image_file": False},
        {"name": "loremipsum_multipage.pdf", "mode": "image", "is_image_file": False},
        {"name": "2023-Jan-economic-outlook.pdf", "mode": "image", "is_image_file": False},
        {"name": "recalibrating-risk-report.pdf", "mode": "image", "is_image_file": False},
        {"name": "Silent-Giant.pdf", "mode": "image", "is_image_file": False},
    ]

    for test_file in test_files:
        f_name = test_file["name"]
        validation_mode = test_file["mode"]
        is_image_file = test_file["is_image_file"]

        validate_performance(f_name, validation_mode, is_image_file)


if __name__ == "__main__":
    cur_dir = os.getcwd()
    base_dir = os.path.join(cur_dir, os.pardir, os.pardir)
    example_docs_dir = os.path.join(base_dir, "sample-docs")

    # folder path to save temporary outputs
    output_dir = os.path.join(cur_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    run()