juriscraper/juriscraper/lib/ocr_utils.py at b73fc00f4ec95a6c22fdfd7e16542d1d8fa4f33c · freelawproject/juriscraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import re

from juriscraper.lib.log_tools import make_default_logger

# Module-level logger shared by the helpers below; it is obtained from the
# project's make_default_logger() helper.
logger = make_default_logger()

# Pre-compiled patterns for the page-marker formats this module recognises.

# "Page 3 of 12" / "Pg 3 of 12" (case-insensitive).
PAGINATION_RE = re.compile(
    r"\b(?:Page|Pg)\s+\d+\s+of\s+\d+\b", flags=re.IGNORECASE
)

# "Page: 3" (case-insensitive; whitespace after the colon is optional).
PAGINATION_COLON_RE = re.compile(r"\bPage:\s*\d+\b", flags=re.IGNORECASE)

# "PageID #: 123" (case-insensitive).
PAGINATION_PAGE_ID_RE = re.compile(
    r"\bPageID\s+#:\s*\d+\b", flags=re.IGNORECASE
)

# A number between spaced dashes, e.g. "- 3 -"; group 1 captures the number.
# This pattern is case-sensitive (it contains no letters).
PAGINATION_NUMBER_DASH_RE = re.compile(r"- (\d+) -")


def is_page_line(line: str) -> bool:
    """Tell whether a line contains a page-number marker.

    Surrounding whitespace is stripped before matching. Three marker formats
    are recognised anywhere in the line: "Page X of Y" / "Pg X of Y",
    "Page: X" and "PageID #: X".

    :param line: A single textual line extracted from a PDF.
    :return: True if any of the three pagination patterns is found in the
    line; False otherwise.
    """
    candidate = line.strip()
    page_marker_patterns = (
        PAGINATION_RE,
        PAGINATION_COLON_RE,
        PAGINATION_PAGE_ID_RE,
    )
    return any(
        pattern.search(candidate) is not None
        for pattern in page_marker_patterns
    )


# Prefixes that mark a line as header/footer boilerplate. Matching is
# case-sensitive and anchored at the start of the (stripped) line.
_HEADER_STARTERS = (
    "Appellate",
    "Appeal",
    "Case",
    "Desc",
    "Document",
    "Entered",
    "Main Document",
    "Page",
    "Received:",
    "USCA",
)

# Patterns compiled once at import time instead of on every call, since
# is_doc_common_header() is invoked for every line of a document.
_DOC_FILED_RE = re.compile(r"\b(Filed|Date Filed)\b")
# Dates with a two-digit year, e.g. 01/02/23.
_DATE_RE = re.compile(r"\b\d{2}/\d{2}/\d{2}\b")
# Timestamps in HH:MM:SS form.
_TIME_RE = re.compile(r"\b\d{2}:\d{2}:\d{2}\b")
# "Received:" followed by a date with a two- or four-digit year, anywhere
# in the line (the prefix tuple only covers lines that start with it).
_RECEIVED_RE = re.compile(r"\bReceived:\s*\d{2}/\d{2}/\d{2}(?:\d{2})?\b")
_EXTRA_HEADER_PATTERNS = (_DOC_FILED_RE, _DATE_RE, _TIME_RE, _RECEIVED_RE)


def is_doc_common_header(line: str) -> bool:
    """Identify common header/footer lines that should be ignored.

    The line is stripped of surrounding whitespace first, mirroring
    is_page_line(), so whitespace-only lines count as empty and indented
    headers are still recognised.

    :param line: A line extracted from a PDF.
    :return: True if the line is empty (after stripping), begins with one of
    the common header starters, or contains a "Page X of Y" / "Page: X"
    marker, a "Filed" / "Date Filed" word, a dd/dd/dd date, a dd:dd:dd time,
    or a "Received: <date>" stamp. False otherwise.
    """
    line = line.strip()
    if not line or line.startswith(_HEADER_STARTERS):
        return True
    return bool(
        PAGINATION_RE.search(line)
        or PAGINATION_COLON_RE.search(line)
        or any(pattern.search(line) for pattern in _EXTRA_HEADER_PATTERNS)
    )


def _missing_pages_fallback(content: str, page_count: int) -> bool:
    """Decide whether OCR is needed when no page-marker line was found.

    First looks for page numbers in several loose formats and reports OCR as
    needed when more than two of the expected pages are unaccounted for.
    Otherwise OCR is needed only if every line is a common header line.

    :param content: The text content of a PDF.
    :param page_count: The expected number of pages in the PDF.
    :return: boolean indicating if OCR is needed.
    """
    page_patterns = (
        r"Page\s+(\d+)",
        r"- (\d+) -",
        r"\[(\d+)\]",
        r"(\d+)\s*$",
    )
    found_pages = set()
    for pattern in page_patterns:
        for match in re.findall(pattern, content, re.MULTILINE):
            try:
                page_num = int(match)
            except ValueError:
                continue
            # Only numbers that could be a real page of this document count.
            if 1 <= page_num <= page_count:
                found_pages.add(page_num)

    missing_pages = set(range(1, page_count + 1)) - found_pages
    if len(missing_pages) > 2:
        logger.info(
            f"Missing pages: {sorted(missing_pages)} out of expected {page_count}"
        )
        return True

    # If any non-header line exists, OCR is not needed
    return all(
        is_doc_common_header(line.strip()) for line in content.splitlines()
    )


def needs_ocr(
    content: str, page_count: int = 0, line_count_threshold: int = 5
) -> bool:
    """Determines if OCR is needed for a PDF (PACER-aware).

    Lines matching is_page_line() are treated as page boundaries. For every
    page delimited that way, the lines that are not common headers are
    counted; if any page (including the trailing one) has fewer than
    ``line_count_threshold`` such lines, OCR is needed. Text that appears
    before the first page marker is ignored.

    If no page marker is found at all, falls back to the missing-pages
    logic followed by a check for any non-header line.

    :param content: The content of a PDF. ``None`` is treated like an empty
    string.
    :param page_count: The expected number of pages in the PDF. ``None`` is
    treated like 0.
    :param line_count_threshold: Minimum non-header lines per page.
    :return: boolean indicating if OCR is needed.
    """
    # Tolerate missing values instead of raising AttributeError/TypeError.
    content = content or ""
    page_count = page_count or 0

    seen_page_marker = False
    other_content_count = 0
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if is_page_line(line):
            # A new marker closes the previous page; check it had enough text.
            if seen_page_marker and other_content_count < line_count_threshold:
                logger.info(
                    f"Page with insufficient content: {other_content_count} lines (threshold: {line_count_threshold})"
                )
                return True
            seen_page_marker = True
            other_content_count = 0
            continue

        # inside a page, count only non-common header lines
        if seen_page_marker and not is_doc_common_header(line):
            other_content_count += 1

    if not seen_page_marker:
        return _missing_pages_fallback(content, page_count)

    # end of document, close the trailing page
    if other_content_count < line_count_threshold:
        logger.info(
            f"Trailing page with insufficient content: {other_content_count} lines (threshold: {line_count_threshold})"
        )
        return True

    return False