Skip to content
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ The following changes are not yet released, but are code complete:

Features:
- implement `neb` and `nebctapp` extract_from_text #1549
- Add new flag `--ocr-available` to `sample_caller` to tell doctor that OCR is available #1552

Changes:
-
Expand Down
137 changes: 137 additions & 0 deletions juriscraper/lib/ocr_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import re

from juriscraper.lib.log_tools import make_default_logger

logger = make_default_logger()

# Pre-compiled pagination patterns, shared by the helpers below. Compiled once
# at import time so per-line checks stay cheap inside needs_ocr()'s loops.
PAGINATION_RE = re.compile(r"\b(?:Page|Pg)\s+\d+\s+of\s+\d+\b", re.I)
PAGINATION_COLON_RE = re.compile(r"\bPage:\s*\d+\b", re.I)
PAGINATION_PAGE_ID_RE = re.compile(r"\bPageID\s+#:\s*\d+\b", re.I)
# NOTE(review): not referenced in this module; presumably kept for importers.
PAGINATION_NUMBER_DASH_RE = re.compile(r"- (\d+) -")


def is_page_line(line: str) -> bool:
    """Detect if a line is a page-number marker.

    :param line: A single textual line extracted from a PDF.
    :return: True if the line matches "Page X of Y", "Page: X", or
        "PageID #: X" (case-insensitive); False otherwise.
    """
    # Strip once instead of once per pattern; each search previously
    # received the same stripped string, so behavior is unchanged.
    stripped = line.strip()
    return bool(
        PAGINATION_RE.search(stripped)
        or PAGINATION_COLON_RE.search(stripped)
        or PAGINATION_PAGE_ID_RE.search(stripped)
    )


# Compiled once at import time; previously these four patterns were
# re-compiled on every call to is_doc_common_header().
_DOC_FILED_RE = re.compile(r"\b(Filed|Date Filed)\b")
_DATE_RE = re.compile(r"\b\d{2}/\d{2}/\d{2}\b")
_TIME_RE = re.compile(r"\b\d{2}:\d{2}:\d{2}\b")
_RECEIVED_RE = re.compile(r"\bReceived:\s*\d{2}/\d{2}/\d{2}(?:\d{2})?\b")

# Common first words of PACER header/footer lines that carry no content.
_BAD_STARTERS = (
    "Appellate",
    "Appeal",
    "Case",
    "Desc",
    "Document",
    "Entered",
    "Main Document",
    "Page",
    "Received:",
    "USCA",
)


def is_doc_common_header(line: str) -> bool:
    """Identify common header/footer lines that should be ignored.

    :param line: A line extracted from a PDF.
    :return: True if the line is empty, begins with common header starters, or
        matches pagination, filing, date/time, or "Received" patterns. False
        otherwise.
    """
    # Empty lines carry no document content.
    if not line:
        return True
    # str.startswith accepts a tuple: one call covers every prefix.
    if line.startswith(_BAD_STARTERS):
        return True
    return bool(
        PAGINATION_RE.search(line)
        or PAGINATION_COLON_RE.search(line)
        or _DOC_FILED_RE.search(line)
        or _DATE_RE.search(line)
        or _TIME_RE.search(line)
        or _RECEIVED_RE.search(line)
    )


def _fallback_needs_ocr(content, page_count):
    """Missing-pages heuristic used when no PACER page markers are found.

    :param content: The full extracted text of a PDF.
    :param page_count: The expected number of pages (0 disables the
        missing-pages check, since no page number can satisfy 1 <= n <= 0).
    :return: boolean indicating if OCR is needed.
    """
    page_patterns = [
        r"Page\s+(\d+)",
        r"- (\d+) -",
        r"\[(\d+)\]",
        r"(\d+)\s*$",
    ]
    found_pages = set()
    for pattern in page_patterns:
        for match in re.findall(pattern, content, re.MULTILINE):
            try:
                page_num = int(match)
            except ValueError:
                continue
            if 1 <= page_num <= page_count:
                found_pages.add(page_num)
    missing_pages = set(range(1, page_count + 1)) - found_pages
    if len(missing_pages) > 2:
        logger.info(
            "Missing pages: %s out of expected %s",
            sorted(missing_pages),
            page_count,
        )
        return True
    # If any non-header line exists, OCR is not needed.
    for line in content.splitlines():
        if not is_doc_common_header(line.strip()):
            return False
    return True


def needs_ocr(content, page_count=0, line_count_threshold=5):
    """Determines if OCR is needed for a PDF (PACER-aware).

    Checks for valid content lines between pages using PACER-style headers.
    Falls back to missing-pages logic if no page lines are found.

    :param content: The content of a PDF.
    :param page_count: The expected number of pages in the PDF.
    :param line_count_threshold: Minimum non-header lines per page.
    :return: boolean indicating if OCR is needed.
    """
    in_page = False
    other_content_count = 0
    saw_any_page = False
    for line in (raw.strip() for raw in content.splitlines()):
        if is_page_line(line):
            # A new page marker closes the previous page: a page with too
            # little real text is likely a scanned image needing OCR.
            if in_page and other_content_count < line_count_threshold:
                logger.info(
                    "Page with insufficient content: %s lines (threshold: %s)",
                    other_content_count,
                    line_count_threshold,
                )
                return True
            in_page = True
            saw_any_page = True
            other_content_count = 0
            continue

        # Text before the first page marker is ignored.
        if not in_page:
            continue

        # Inside a page, count only non-common header lines.
        if not is_doc_common_header(line):
            other_content_count += 1

    # End of document, close the trailing page.
    if in_page and other_content_count < line_count_threshold:
        logger.info(
            "Trailing page with insufficient content: %s lines (threshold: %s)",
            other_content_count,
            line_count_threshold,
        )
        return True

    # If no pages were found, fall back to the regular behavior of checking
    # whether any content remains after removing common headers.
    if not saw_any_page:
        return _fallback_needs_ocr(content, page_count)

    return False
37 changes: 32 additions & 5 deletions sample_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from juriscraper.lib.importer import build_module_list, site_yielder
from juriscraper.lib.log_tools import make_default_logger
from juriscraper.lib.ocr_utils import needs_ocr
from juriscraper.lib.string_utils import trunc

logger = make_default_logger()
Expand Down Expand Up @@ -86,8 +87,20 @@ def log_dict(dic: dict) -> None:
logger.debug(' %s: "%s"', k, v)


def extract_content_from_doctor(url, files, ocr_available=False):
    """Post a document to a doctor extraction endpoint.

    :param url: The doctor `document-extract` endpoint URL.
    :param files: `requests`-style files mapping holding the document payload.
    :param ocr_available: If True, tell doctor that OCR may be used.
    :return: A tuple of (extracted content, page count).
    :raises requests.HTTPError: On a non-2xx response from doctor.
    """
    params = {"ocr_available": True} if ocr_available else None
    response = requests.post(url, files=files, timeout=120, params=params)
    response.raise_for_status()
    # Parse the JSON body once instead of decoding it twice.
    payload = response.json()
    return payload["content"], payload["page_count"]


def extract_doc_content(
data, extract_from_text: bool, site, doctor_host: str, filename: str
data,
extract_from_text: bool,
site,
doctor_host: str,
filename: str,
ocr_available: bool = False,
):
"""Extracts document's content using a local doctor host

Expand All @@ -101,6 +114,7 @@ def extract_doc_content(
:param doctor_host: local doctor instance host. calls will fail if
the doctor host is not valid
:param filename: Name for saving extracted content into a file in tmp
    :param ocr_available: If True, tell doctor that OCR is available

:return: a tuple with:
the extracted content
Expand All @@ -119,9 +133,13 @@ def extract_doc_content(

files = {"file": (f"something.{extension}", data)}
url = MICROSERVICE_URLS["document-extract"].format(doctor_host)
extraction__response = requests.post(url, files=files, timeout=120)
extraction__response.raise_for_status()
extracted_content = extraction__response.json()["content"]
extracted_content, page_count = extract_content_from_doctor(url, files)

if ocr_available and needs_ocr(extracted_content, page_count):
logger.info("OCR is needed for this document. Using OCR doctor.")
extracted_content, page_count = extract_content_from_doctor(
url, files, ocr_available=True
)

# The extracted content is embedded for display in Courtlistener.
# We save it into /tmp/ to have an idea how it would look. You can
Expand Down Expand Up @@ -246,6 +264,7 @@ def scrape_court(
doctor_host="",
test_hashes: bool = False,
limit: int = 1000,
ocr_available: bool = False,
):
"""Calls the requested court(s), gets its binary content, and
extracts the content if possible. See --extract-content option
Expand Down Expand Up @@ -298,7 +317,7 @@ def scrape_court(
filename = item["case_names"].lower().replace(" ", "_")[:40]

data, metadata_from_text = extract_doc_content(
data, extract_content, site, doctor_host, filename
data, extract_content, site, doctor_host, filename, ocr_available
)
logger.log(
5, "\nShowing extracted document data (500 chars):\n%s", data[:500]
Expand Down Expand Up @@ -486,6 +505,12 @@ def main():
default=1000,
help="How many items to scrape per `scrape_court` call",
)
parser.add_option(
"--ocr-available",
action="store_true",
default=False,
help="If set it will tell doctor that OCR is available. ",
)

(options, args) = parser.parse_args()

Expand All @@ -501,6 +526,7 @@ def main():
save_responses = options.save_responses
test_hashes = options.test_hashes
limit_per_scrape = options.limit_per_scrape
ocr_available = options.ocr_available

if test_hashes:
binaries = True
Expand Down Expand Up @@ -572,6 +598,7 @@ def main():
doctor_host,
test_hashes,
limit_per_scrape,
ocr_available,
)

logger.debug("The scraper has stopped.")
Expand Down
Loading