|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import sys |
| 5 | +import re |
| 6 | +import datetime |
| 7 | + |
| 8 | +# PyMuPDF is imported as fitz |
| 9 | +try: |
| 10 | + import fitz |
| 11 | +except ImportError: |
| 12 | + print("FATAL: PyMuPDF library not found.") |
| 13 | + print("--> Please install it by running: pip install pymupdf") |
| 14 | + sys.exit(1) |
| 15 | + |
# Zero-based index of the first page scanned for manufacturer table rows;
# the parser iterates from this page to the end of the document.
START_PAGE_INDEX = 5
# Zero-based index of the page searched for the
# "The present list is complete as of ..." date sentence.
DATE_SEARCH_PAGE_INDEX = 4
| 18 | + |
def extract_document_name(doc):
    """Return the document identifier (e.g. 'JEP106BM') found in the PDF.

    The metadata title is checked first; if it holds no identifier, the text
    of the first page is scanned. The match is case-insensitive and the
    result is upper-cased.

    Args:
        doc: An opened document exposing ``metadata`` (a dict or None) and
            ``load_page(index)`` returning a page with ``get_text("text")``.

    Returns:
        The identifier string, or the default ``"JEP106BM"`` when nothing
        is found or the cover page cannot be read.
    """
    print("[INFO] Extracting document name...")
    name_pattern = re.compile(r'(JEP106[A-Z]{2})', re.IGNORECASE)

    # Attempt to find in metadata first. The metadata dict itself, or the
    # 'title' value inside it, may be None; normalise both to an empty
    # string so the regex search never receives a non-string.
    meta_title = (doc.metadata or {}).get('title') or ''
    match = name_pattern.search(meta_title)
    if match:
        name = match.group(1).upper()
        print(" |-- Found in metadata: '{}'".format(name))
        return name

    # Fallback to scanning the cover page
    try:
        print(" |-- Not in metadata, scanning cover page...")
        cover_page_text = doc.load_page(0).get_text("text")
        match = name_pattern.search(cover_page_text)
        if match:
            name = match.group(1).upper()
            print(" |-- Found on cover page: '{}'".format(name))
            return name
    except Exception as e:
        # Best effort: a cover page that cannot be read must not abort the run.
        print(" |-- [WARN] Error scanning cover page: {}".format(e))

    print(" |-- [WARN] Could not find name. Using default 'JEP106BM'.")
    return "JEP106BM"
| 44 | + |
def extract_document_date(doc):
    """Return the document's effective date formatted as ``YYYY.MM.DD``.

    The page at ``DATE_SEARCH_PAGE_INDEX`` is searched for the sentence
    "The present list is complete as of <Month> <day>, <year>." and the
    date is parsed from it.

    Args:
        doc: An opened document exposing ``load_page(index)`` whose pages
            provide ``get_text("text")``.

    Returns:
        The parsed date as ``"%Y.%m.%d"``; if the sentence is missing or
        cannot be parsed, today's date in the same format.
    """
    print("[INFO] Extracting document date from page {}...".format(DATE_SEARCH_PAGE_INDEX + 1))
    try:
        page = doc.load_page(DATE_SEARCH_PAGE_INDEX)
        text = page.get_text("text")

        # Extracted PDF text may break the sentence (or the date itself)
        # across lines, so allow any whitespace between the words and let
        # the captured date span newlines.
        match = re.search(
            r'The\s+present\s+list\s+is\s+complete\s+as\s+of\s+(.*?)\.',
            text,
            re.IGNORECASE | re.DOTALL,
        )
        if match:
            # Collapse runs of whitespace/newlines so strptime sees
            # "Month D, YYYY" with single spaces.
            date_str = ' '.join(match.group(1).split())
            # NOTE: %B matches the month name of the current locale.
            dt_obj = datetime.datetime.strptime(date_str, "%B %d, %Y")
            formatted_date = dt_obj.strftime("%Y.%m.%d")
            print(" |-- Found and parsed date: '{}'".format(formatted_date))
            return formatted_date
    except Exception as e:
        # Best effort: any page-loading or parsing failure falls through to
        # the current-date fallback below.
        print(" |-- [WARN] Failed to parse date: {}".format(e))

    print(" |-- [WARN] Could not extract date. Using current date as fallback.")
    return datetime.date.today().strftime("%Y.%m.%d")
| 64 | + |
def clean_manufacturer_name(raw_name):
    """Return the manufacturer name with table residue and odd punctuation removed.

    Three steps are applied in order:
      1. strip the trailing bit columns and optional two-digit hex code
         (e.g. " 1 1 0 0 0 1 1 1 C7"), tolerating trailing whitespace;
      2. collapse every run of whitespace to a single space;
      3. map typographic quotes and dashes to their ASCII equivalents.

    Args:
        raw_name: The text following the numeric ID on a table row.

    Returns:
        The cleaned name with leading/trailing whitespace removed.
    """
    # " 1 1 0 0 0 1 1 1 C7" -- the trailing \s* keeps the anchor working
    # when the row text ends in stray whitespace.
    cleaned_name = re.sub(
        r'(?:\s+[01])+(?:\s+[0-9A-Fa-f]{2})?\s*$', '', raw_name)

    # Replace sequences of one or more whitespace characters with a single space.
    cleaned_name = re.sub(r'\s+', ' ', cleaned_name)

    # Handle non-ASCII punctuation
    replacements = {
        '\u2019': "'",  # Right Single Quotation Mark -> Apostrophe
        '\u2018': "'",  # Left Single Quotation Mark -> Apostrophe
        '\u201d': '"',  # Right Double Quotation Mark -> Quotation Mark
        '\u201c': '"',  # Left Double Quotation Mark -> Quotation Mark
        '\u2014': '-',  # Em Dash -> Hyphen
        '\u2013': '-',  # En Dash -> Hyphen
    }
    # One pass over the string instead of one .replace() per character.
    cleaned_name = cleaned_name.translate(str.maketrans(replacements))

    return cleaned_name.strip()
| 90 | + |
def _iter_page_entries(lines, line_pattern):
    """Yield ``(id_code, raw_name)`` for each table row in a page's text lines.

    A row is a non-empty line matching ``line_pattern``. If the line right
    after a row is non-empty and does not itself start a new row, it is
    treated as a wrapped continuation and appended to the row's name.
    """
    i = 0
    while i < len(lines):
        line_stripped = lines[i].strip()
        # Immediately prepare for the next iteration.
        i += 1

        if not line_stripped:
            continue

        match = line_pattern.match(line_stripped)
        if not match:
            continue

        id_code, raw_name = match.groups()

        # Look ahead to the next line for a possible continuation.
        if i < len(lines):
            next_line_stripped = lines[i].strip()
            if next_line_stripped and not line_pattern.match(next_line_stripped):
                raw_name = "{} {}".format(raw_name, next_line_stripped)
                # The continuation line has been consumed.
                i += 1

        yield id_code, raw_name


def parse_jep106_pdf(input_path, output_path):
    """Parse a JEP106 PDF and write the manufacturer ID list to a text file.

    The output starts with an empty line and two ``#`` header lines
    (document name and version date), followed by one line per bank number
    and, under each bank, tab-indented ``<id> <name>`` lines. Pages are read
    from ``START_PAGE_INDEX`` until the end of the document or until a page
    containing the Annex A heading is reached.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Path of the text file to write (ASCII, strict).

    Returns:
        None. Failures to open the PDF or to write the output are reported
        on stdout and the function returns without raising.
    """
    print("--- JEP106 Parser Started ---")
    print("[INFO] Input PDF: {}".format(input_path))
    print("[INFO] Output file: {}".format(output_path))

    try:
        doc = fitz.open(input_path)
        print("[OK] PDF file opened successfully ({} pages).".format(len(doc)))
    except Exception as e:
        print("FATAL: Failed to open or read the PDF file '{}'.".format(input_path))
        print("--> Error: {}".format(e))
        return

    output_lines = ['']
    current_bank = 0
    manufacturer_count = 0
    line_pattern = re.compile(r'^(\d{1,3})\s+(.*)')

    try:
        # Header generation
        output_lines.append("# {}".format(extract_document_name(doc)))
        output_lines.append("# Version: {}".format(extract_document_date(doc)))
        print("[OK] File header generated.")

        print("\n--- Starting Parsing ---")

        for page_num in range(START_PAGE_INDEX, len(doc)):
            print("\n[PAGE {}/{}]".format(page_num + 1, len(doc)))
            page = doc.load_page(page_num)
            text = page.get_text("text")

            # Check for the start of the appendix to stop parsing.
            if "Annex A (informative) Name Changes" in text:
                print(" [STOP] Detected start of Annex A. Terminating main content parsing.")
                break

            lines = text.split('\n')

            if page_num == START_PAGE_INDEX and current_bank == 0:
                current_bank = 1
                output_lines.append(str(current_bank))
                print(" -> Initialized to Bank {}.".format(current_bank))

            # Check for bank switch text before parsing lines to correctly
            # associate all entries on this page with the new bank.
            if "The following numbers are all in bank" in text:
                current_bank += 1
                output_lines.append(str(current_bank))
                print(" -> Detected switch to Bank {}.".format(current_bank))

            for id_code, raw_name in _iter_page_entries(lines, line_pattern):
                # Skip entries that are not actual manufacturers
                if "Continuation Code" in raw_name:
                    print(" [SKIP] 'Continuation Code' entry.")
                    continue

                final_name = clean_manufacturer_name(raw_name)
                output_lines.append("\t{} {}".format(id_code, final_name))
                manufacturer_count += 1

                print(" [OK] ID: {:<4} Name: {}".format(id_code, final_name))
    finally:
        # Release the PDF handle even if parsing raises.
        doc.close()

    print("\n--- Parsing Complete ---")
    print("[INFO] Total manufacturers found: {}".format(manufacturer_count))
    print("[INFO] Total banks processed: {}".format(current_bank))
    print("[INFO] Writing {} final lines to '{}'...".format(len(output_lines), output_path))

    try:
        # Strict ASCII: any character not normalised earlier surfaces here
        # as an encoding error instead of being written silently.
        with open(output_path, 'w', encoding='ascii', errors='strict') as f:
            f.write('\n'.join(output_lines))
            f.write('\n')
        print("\n--- SUCCESS ---")
        print("Generated file: '{}'".format(output_path))
    except Exception as e:
        print("\n--- FATAL ERROR ---")
        print("Failed to write to the output file '{}'.".format(output_path))
        print("--> Character Encoding Error or I/O issue.")
        print("--> Detailed Error: {}".format(e))
| 196 | + |
# Script entry point: expects exactly two positional arguments,
# the input PDF path and the output file path.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python {} <input_pdf_file> <output_ids_file>".format(sys.argv[0]))
        sys.exit(1)

    pdf_path, ids_path = cli_args
    parse_jep106_pdf(pdf_path, ids_path)
0 commit comments