Skip to content
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ The following changes are not yet released, but are code complete:

Features:
- implement `neb` and `nebctapp` extract_from_text #1549
- Add new flag `--ocr-available` to `sample_caller` to tell doctor that OCR is available #1552

Changes:
-
Expand Down
137 changes: 137 additions & 0 deletions juriscraper/lib/ocr_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import re

from juriscraper.lib.log_tools import make_default_logger

logger = make_default_logger()

# Pre-compiled pagination patterns, shared by the helpers below. Compiled once
# at import time so per-line checks stay cheap inside needs_ocr()'s loops.
PAGINATION_RE = re.compile(r"\b(?:Page|Pg)\s+\d+\s+of\s+\d+\b", re.I)
PAGINATION_COLON_RE = re.compile(r"\bPage:\s*\d+\b", re.I)
PAGINATION_PAGE_ID_RE = re.compile(r"\bPageID\s+#:\s*\d+\b", re.I)
# NOTE(review): not referenced in this module; presumably kept for importers.
PAGINATION_NUMBER_DASH_RE = re.compile(r"- (\d+) -")


def is_page_line(line: str) -> bool:
    """Detect if a line is a page-number marker.

    :param line: A single textual line extracted from a PDF.
    :return: True if the line matches "Page X of Y", "Page: X", or
        "PageID #: X" (case-insensitive); False otherwise.
    """
    # Strip once instead of once per pattern; each search previously
    # received the same stripped string, so behavior is unchanged.
    stripped = line.strip()
    return bool(
        PAGINATION_RE.search(stripped)
        or PAGINATION_COLON_RE.search(stripped)
        or PAGINATION_PAGE_ID_RE.search(stripped)
    )


# Compiled once at import time; previously these four patterns were
# re-compiled on every call to is_doc_common_header().
_DOC_FILED_RE = re.compile(r"\b(Filed|Date Filed)\b")
_DATE_RE = re.compile(r"\b\d{2}/\d{2}/\d{2}\b")
_TIME_RE = re.compile(r"\b\d{2}:\d{2}:\d{2}\b")
_RECEIVED_RE = re.compile(r"\bReceived:\s*\d{2}/\d{2}/\d{2}(?:\d{2})?\b")

# Common first words of PACER header/footer lines that carry no content.
_BAD_STARTERS = (
    "Appellate",
    "Appeal",
    "Case",
    "Desc",
    "Document",
    "Entered",
    "Main Document",
    "Page",
    "Received:",
    "USCA",
)


def is_doc_common_header(line: str) -> bool:
    """Identify common header/footer lines that should be ignored.

    :param line: A line extracted from a PDF.
    :return: True if the line is empty, begins with common header starters, or
        matches pagination, filing, date/time, or "Received" patterns. False
        otherwise.
    """
    # Empty lines carry no document content.
    if not line:
        return True
    # str.startswith accepts a tuple: one call covers every prefix.
    if line.startswith(_BAD_STARTERS):
        return True
    return bool(
        PAGINATION_RE.search(line)
        or PAGINATION_COLON_RE.search(line)
        or _DOC_FILED_RE.search(line)
        or _DATE_RE.search(line)
        or _TIME_RE.search(line)
        or _RECEIVED_RE.search(line)
    )


def _fallback_needs_ocr(content, page_count):
    """Missing-pages heuristic used when no PACER page markers are found.

    :param content: The full extracted text of a PDF.
    :param page_count: The expected number of pages (0 disables the
        missing-pages check, since no page number can satisfy 1 <= n <= 0).
    :return: boolean indicating if OCR is needed.
    """
    page_patterns = [
        r"Page\s+(\d+)",
        r"- (\d+) -",
        r"\[(\d+)\]",
        r"(\d+)\s*$",
    ]
    found_pages = set()
    for pattern in page_patterns:
        for match in re.findall(pattern, content, re.MULTILINE):
            try:
                page_num = int(match)
            except ValueError:
                continue
            if 1 <= page_num <= page_count:
                found_pages.add(page_num)
    missing_pages = set(range(1, page_count + 1)) - found_pages
    if len(missing_pages) > 2:
        logger.info(
            "Missing pages: %s out of expected %s",
            sorted(missing_pages),
            page_count,
        )
        return True
    # If any non-header line exists, OCR is not needed.
    for line in content.splitlines():
        if not is_doc_common_header(line.strip()):
            return False
    return True


def needs_ocr(content, page_count=0, line_count_threshold=5):
    """Determines if OCR is needed for a PDF (PACER-aware).

    Checks for valid content lines between pages using PACER-style headers.
    Falls back to missing-pages logic if no page lines are found.

    :param content: The content of a PDF.
    :param page_count: The expected number of pages in the PDF.
    :param line_count_threshold: Minimum non-header lines per page.
    :return: boolean indicating if OCR is needed.
    """
    in_page = False
    other_content_count = 0
    saw_any_page = False
    for line in (raw.strip() for raw in content.splitlines()):
        if is_page_line(line):
            # A new page marker closes the previous page: a page with too
            # little real text is likely a scanned image needing OCR.
            if in_page and other_content_count < line_count_threshold:
                logger.info(
                    "Page with insufficient content: %s lines (threshold: %s)",
                    other_content_count,
                    line_count_threshold,
                )
                return True
            in_page = True
            saw_any_page = True
            other_content_count = 0
            continue

        # Text before the first page marker is ignored.
        if not in_page:
            continue

        # Inside a page, count only non-common header lines.
        if not is_doc_common_header(line):
            other_content_count += 1

    # End of document, close the trailing page.
    if in_page and other_content_count < line_count_threshold:
        logger.info(
            "Trailing page with insufficient content: %s lines (threshold: %s)",
            other_content_count,
            line_count_threshold,
        )
        return True

    # If no pages were found, fall back to the regular behavior of checking
    # whether any content remains after removing common headers.
    if not saw_any_page:
        return _fallback_needs_ocr(content, page_count)

    return False
37 changes: 32 additions & 5 deletions sample_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from juriscraper.lib.importer import build_module_list, site_yielder
from juriscraper.lib.log_tools import make_default_logger
from juriscraper.lib.ocr_utils import needs_ocr
from juriscraper.lib.string_utils import trunc

logger = make_default_logger()
Expand Down Expand Up @@ -86,8 +87,20 @@ def log_dict(dic: dict) -> None:
logger.debug(' %s: "%s"', k, v)


def extract_content_from_doctor(url, files, ocr_available=False):
    """Post a document to a doctor extraction endpoint.

    :param url: The doctor `document-extract` endpoint URL.
    :param files: `requests`-style files mapping holding the document payload.
    :param ocr_available: If True, tell doctor that OCR may be used.
    :return: A tuple of (extracted content, page count).
    :raises requests.HTTPError: On a non-2xx response from doctor.
    """
    params = {"ocr_available": True} if ocr_available else None
    response = requests.post(url, files=files, timeout=120, params=params)
    response.raise_for_status()
    # Parse the JSON body once instead of decoding it twice.
    payload = response.json()
    return payload["content"], payload["page_count"]


def extract_doc_content(
data, extract_from_text: bool, site, doctor_host: str, filename: str
data,
extract_from_text: bool,
site,
doctor_host: str,
filename: str,
ocr_available: bool = False,
):
"""Extracts document's content using a local doctor host

Expand All @@ -101,6 +114,7 @@ def extract_doc_content(
:param doctor_host: local doctor instance host. calls will fail if
the doctor host is not valid
:param filename: Name for saving extracted content into a file in tmp
    :param ocr_available: If True, tell doctor that OCR is available

:return: a tuple with:
the extracted content
Expand All @@ -119,9 +133,13 @@ def extract_doc_content(

files = {"file": (f"something.{extension}", data)}
url = MICROSERVICE_URLS["document-extract"].format(doctor_host)
extraction__response = requests.post(url, files=files, timeout=120)
extraction__response.raise_for_status()
extracted_content = extraction__response.json()["content"]
extracted_content, page_count = extract_content_from_doctor(url, files)

if ocr_available and needs_ocr(extracted_content, page_count):
logger.info("OCR is needed for this document. Using OCR doctor.")
extracted_content, page_count = extract_content_from_doctor(
url, files, ocr_available=True
)

# The extracted content is embedded for display in Courtlistener.
# We save it into /tmp/ to have an idea how it would look. You can
Expand Down Expand Up @@ -246,6 +264,7 @@ def scrape_court(
doctor_host="",
test_hashes: bool = False,
limit: int = 1000,
ocr_available: bool = False,
):
"""Calls the requested court(s), gets its binary content, and
extracts the content if possible. See --extract-content option
Expand Down Expand Up @@ -298,7 +317,7 @@ def scrape_court(
filename = item["case_names"].lower().replace(" ", "_")[:40]

data, metadata_from_text = extract_doc_content(
data, extract_content, site, doctor_host, filename
data, extract_content, site, doctor_host, filename, ocr_available
)
logger.log(
5, "\nShowing extracted document data (500 chars):\n%s", data[:500]
Expand Down Expand Up @@ -486,6 +505,12 @@ def main():
default=1000,
help="How many items to scrape per `scrape_court` call",
)
parser.add_option(
"--ocr-available",
action="store_true",
default=False,
help="If set it will tell doctor that OCR is available. ",
)

(options, args) = parser.parse_args()

Expand All @@ -501,6 +526,7 @@ def main():
save_responses = options.save_responses
test_hashes = options.test_hashes
limit_per_scrape = options.limit_per_scrape
ocr_available = options.ocr_available

if test_hashes:
binaries = True
Expand Down Expand Up @@ -572,6 +598,7 @@ def main():
doctor_host,
test_hashes,
limit_per_scrape,
ocr_available,
)

logger.debug("The scraper has stopped.")
Expand Down
Loading