import re

from juriscraper.lib.log_tools import make_default_logger

logger = make_default_logger()

# Pagination markers commonly stamped on court PDFs, e.g. "Page 3 of 12".
PAGINATION_RE = re.compile(r"\b(?:Page|Pg)\s+\d+\s+of\s+\d+\b", re.I)
# e.g. "Page: 3"
PAGINATION_COLON_RE = re.compile(r"\bPage:\s*\d+\b", re.I)
# e.g. "PageID #: 456"
PAGINATION_PAGE_ID_RE = re.compile(r"\bPageID\s+#:\s*\d+\b", re.I)
# e.g. "- 3 -" (also reused as a fallback page-number pattern below)
PAGINATION_NUMBER_DASH_RE = re.compile(r"- (\d+) -")

# Header/footer patterns used by is_doc_common_header(); compiled once at
# import time instead of on every call (the function runs once per line
# of every document).
_DOC_FILED_RE = re.compile(r"\b(Filed|Date Filed)\b")
_DATE_RE = re.compile(r"\b\d{2}/\d{2}/\d{2}\b")
_TIME_RE = re.compile(r"\b\d{2}:\d{2}:\d{2}\b")
_RECEIVED_RE = re.compile(r"\bReceived:\s*\d{2}/\d{2}/\d{2}(?:\d{2})?\b")

# Leading text of boilerplate header/footer lines (CM/ECF-style stamps).
_BAD_STARTERS = (
    "Appellate",
    "Appeal",
    "Case",
    "Desc",
    "Document",
    "Entered",
    "Main Document",
    "Page",
    "Received:",
    "USCA",
)

# Fallback page-number patterns for needs_ocr(); compiled once. Only the
# last pattern anchors on end-of-line, so only it needs MULTILINE.
_FALLBACK_PAGE_PATTERNS = (
    re.compile(r"Page\s+(\d+)"),
    PAGINATION_NUMBER_DASH_RE,
    re.compile(r"\[(\d+)\]"),
    re.compile(r"(\d+)\s*$", re.M),
)


def is_page_line(line: str) -> bool:
    """Detect if a line is a page-number marker.

    :param line: A single textual line extracted from a PDF.
    :return: True if the line matches "Page X of Y", "Page: X", or
        "PageID #: X"; False otherwise.
    """
    stripped = line.strip()  # strip once instead of once per pattern
    return bool(
        PAGINATION_RE.search(stripped)
        or PAGINATION_COLON_RE.search(stripped)
        or PAGINATION_PAGE_ID_RE.search(stripped)
    )


def is_doc_common_header(line: str) -> bool:
    """Identify common header/footer lines that should be ignored.

    :param line: A line extracted from a PDF.
    :return: True if the line is empty, begins with common header starters,
        or matches pagination, filing, date/time, or "Received" patterns.
        False otherwise.
    """
    if not line:
        return True
    # str.startswith accepts a tuple of prefixes: one C-level check.
    if line.startswith(_BAD_STARTERS):
        return True
    return bool(
        PAGINATION_RE.search(line)
        or PAGINATION_COLON_RE.search(line)
        or _DOC_FILED_RE.search(line)
        or _DATE_RE.search(line)
        or _TIME_RE.search(line)
        or _RECEIVED_RE.search(line)
    )


def needs_ocr(content, page_count=0, line_count_threshold=5):
    """Determine if OCR is needed for a PDF (PACER-aware).

    Checks for valid content lines between pages using PACER-style headers.
    Falls back to missing-pages logic if no page lines are found.

    :param content: The text content extracted from a PDF.
    :param page_count: The expected number of pages in the PDF.
    :param line_count_threshold: Minimum non-header lines per page.
    :return: boolean indicating if OCR is needed.
    """
    in_page = False
    other_content_count = 0
    saw_any_page = False
    for line in (ln.strip() for ln in content.splitlines()):
        if is_page_line(line):
            # A new page marker closes the previous page; verify that page
            # held enough real content before resetting the counter.
            if in_page and other_content_count < line_count_threshold:
                logger.info(
                    "Page with insufficient content: %s lines (threshold: %s)",
                    other_content_count,
                    line_count_threshold,
                )
                return True
            in_page = True
            saw_any_page = True
            other_content_count = 0
            continue

        if not in_page:
            continue

        # Inside a page, count only non-common header lines.
        if not is_doc_common_header(line):
            other_content_count += 1

    # End of document: close the trailing page.
    if in_page and other_content_count < line_count_threshold:
        logger.info(
            "Trailing page with insufficient content: %s lines (threshold: %s)",
            other_content_count,
            line_count_threshold,
        )
        return True

    # If no page markers were found, fall back to checking which expected
    # page numbers appear anywhere in the content.
    if not saw_any_page:
        found_pages = set()
        for pattern in _FALLBACK_PAGE_PATTERNS:
            for match in pattern.findall(content):
                try:
                    page_num = int(match)
                except ValueError:  # defensive; all groups are \d+
                    continue
                if 1 <= page_num <= page_count:
                    found_pages.add(page_num)
        missing_pages = set(range(1, page_count + 1)) - found_pages
        if len(missing_pages) > 2:
            logger.info(
                "Missing pages: %s out of expected %s",
                sorted(missing_pages),
                page_count,
            )
            return True
        # If any non-header line exists, OCR is not needed.
        for line in content.splitlines():
            if not is_doc_common_header(line.strip()):
                return False
        return True

    return False
juriscraper.lib.log_tools import make_default_logger +from juriscraper.lib.ocr_utils import needs_ocr from juriscraper.lib.string_utils import trunc logger = make_default_logger() @@ -86,8 +87,20 @@ def log_dict(dic: dict) -> None: logger.debug(' %s: "%s"', k, v) +def extract_content_from_doctor(url, files, ocr_available=False): + params = {"ocr_available": ocr_available} + response = requests.post(url, files=files, timeout=120, params=params) + response.raise_for_status() + return response.json()["content"], response.json()["page_count"] + + def extract_doc_content( - data, extract_from_text: bool, site, doctor_host: str, filename: str + data, + extract_from_text: bool, + site, + doctor_host: str, + filename: str, + ocr_available: bool = False, ): """Extracts document's content using a local doctor host @@ -101,6 +114,7 @@ def extract_doc_content( :param doctor_host: local doctor instance host. calls will fail if the doctor host is not valid :param filename: Name for saving extracted content into a file in tmp + :param ocr_available: if True, it will tell doctor that OCR is available :return: a tuple with: the extracted content @@ -119,9 +133,15 @@ def extract_doc_content( files = {"file": (f"something.{extension}", data)} url = MICROSERVICE_URLS["document-extract"].format(doctor_host) - extraction__response = requests.post(url, files=files, timeout=120) - extraction__response.raise_for_status() - extracted_content = extraction__response.json()["content"] + extracted_content, page_count = extract_content_from_doctor( + url, files, ocr_available=False + ) + + if ocr_available and needs_ocr(extracted_content, page_count): + logger.info("OCR is needed for this document. Using OCR doctor.") + extracted_content, page_count = extract_content_from_doctor( + url, files, ocr_available=True + ) # The extracted content is embedded for display in Courtlistener. # We save it into /tmp/ to have an idea how it would look. 
You can @@ -246,6 +266,7 @@ def scrape_court( doctor_host="", test_hashes: bool = False, limit: int = 1000, + ocr_available: bool = False, ): """Calls the requested court(s), gets its binary content, and extracts the content if possible. See --extract-content option @@ -298,7 +319,7 @@ def scrape_court( filename = item["case_names"].lower().replace(" ", "_")[:40] data, metadata_from_text = extract_doc_content( - data, extract_content, site, doctor_host, filename + data, extract_content, site, doctor_host, filename, ocr_available ) logger.log( 5, "\nShowing extracted document data (500 chars):\n%s", data[:500] ) @@ -486,6 +507,12 @@ def main(): default=1000, help="How many items to scrape per `scrape_court` call", ) + parser.add_option( + "--ocr-available", + action="store_true", + default=False, + help="If set, it will tell doctor that OCR is available.", + ) (options, args) = parser.parse_args() @@ -501,6 +528,7 @@ def main(): save_responses = options.save_responses test_hashes = options.test_hashes limit_per_scrape = options.limit_per_scrape + ocr_available = options.ocr_available if test_hashes: binaries = True @@ -572,6 +600,7 @@ def main(): doctor_host, test_hashes, limit_per_scrape, + ocr_available, ) logger.debug("The scraper has stopped.")