-
-
Notifications
You must be signed in to change notification settings - Fork 148
1552 implement needs ocr in sample caller #1554
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Luis-manzur
wants to merge
8
commits into
main
Choose a base branch
from
1552-implement-needs_ocr-in-sample-caller
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 4 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
61e0426
feat(ocr): implement needs_ocr function to determine OCR necessity fo…
Luis-manzur f3e93ce
feat(ocr): enhance needs_ocr function to integrate page count
Luis-manzur c0b8fbc
feat(ocr): update help message for --ocr-available option in sample_c…
Luis-manzur b73fc00
chore: add ocr feat to CHANGES.md
Luis-manzur f91436f
fix(sample_caller): update params handling for ocr_available in extra…
Luis-manzur f2d255b
fix(sample_caller): correct ocr_available parameter handling in extra…
Luis-manzur 5968a7f
Merge remote-tracking branch 'origin/1552-implement-needs_ocr-in-samp…
Luis-manzur 1a9924b
Merge branch 'main' into 1552-implement-needs_ocr-in-sample-caller
Luis-manzur File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,137 @@ | ||
| import re | ||
|
|
||
| from juriscraper.lib.log_tools import make_default_logger | ||
|
|
||
| logger = make_default_logger() | ||
|
|
||
# Pagination markers recognised in text extracted from PDFs.
# "Page 3 of 10" / "Pg 3 of 10" (case-insensitive).
PAGINATION_RE = re.compile(
    r"\b(?:Page|Pg)\s+\d+\s+of\s+\d+\b",
    flags=re.IGNORECASE,
)
# "Page: 3" (case-insensitive, optional whitespace after the colon).
PAGINATION_COLON_RE = re.compile(
    r"\bPage:\s*\d+\b",
    flags=re.IGNORECASE,
)
# "PageID #: 45" (case-insensitive).
PAGINATION_PAGE_ID_RE = re.compile(
    r"\bPageID\s+#:\s*\d+\b",
    flags=re.IGNORECASE,
)
# "- 3 -" style page numbers; the number is captured in group 1.
PAGINATION_NUMBER_DASH_RE = re.compile(r"- (\d+) -")
|
|
||
|
|
||
def is_page_line(line: str) -> bool:
    """Tell whether a line contains a page-number marker.

    Surrounding whitespace is stripped before matching.

    :param line: A single textual line extracted from a PDF.
    :return: True if the line contains "Page X of Y" / "Pg X of Y",
        "Page: X" or "PageID #: X"; False otherwise.
    """
    text = line.strip()
    for marker_re in (
        PAGINATION_RE,
        PAGINATION_COLON_RE,
        PAGINATION_PAGE_ID_RE,
    ):
        if marker_re.search(text):
            return True
    return False
|
|
||
|
|
||
# Prefixes and patterns used by is_doc_common_header. They are built once at
# import time because the function is called for every line of a document.
_HEADER_STARTERS = (
    "Appellate",
    "Appeal",
    "Case",
    "Desc",
    "Document",
    "Entered",
    "Main Document",
    "Page",
    "Received:",
    "USCA",
)
_DOC_FILED_RE = re.compile(r"\b(Filed|Date Filed)\b")
# Dates with a two-digit year only, e.g. "01/02/20".
_DATE_RE = re.compile(r"\b\d{2}/\d{2}/\d{2}\b")
_TIME_RE = re.compile(r"\b\d{2}:\d{2}:\d{2}\b")
# "Received: 01/02/20" or "Received: 01/02/2020" anywhere in the line.
_RECEIVED_RE = re.compile(r"\bReceived:\s*\d{2}/\d{2}/\d{2}(?:\d{2})?\b")


def is_doc_common_header(line: str) -> bool:
    """Identify common header/footer lines that should be ignored.

    The line is matched as given; it is not stripped here.

    :param line: A line extracted from a PDF.
    :return: True if the line is empty, begins with one of the common header
        starters, or contains a "Page X of Y" / "Page: X" marker, the word
        "Filed", a MM/DD/YY-style date, an HH:MM:SS time, or a
        "Received: <date>" stamp. False otherwise.
    """
    if not line or line.startswith(_HEADER_STARTERS):
        return True
    return any(
        pattern.search(line)
        for pattern in (
            PAGINATION_RE,
            PAGINATION_COLON_RE,
            _DOC_FILED_RE,
            _DATE_RE,
            _TIME_RE,
            _RECEIVED_RE,
        )
    )
|
|
||
|
|
||
def needs_ocr(content, page_count=0, line_count_threshold=5):
    """Decide whether a PDF's extracted text is too thin and OCR is needed.

    The text is split into pages using page-marker lines (see
    ``is_page_line``). Each page must hold at least ``line_count_threshold``
    lines that are not common headers. Text before the first marker is
    ignored. When no marker is found at all, the function falls back to
    looking for page numbers and, failing that, for any non-header line.

    :param content: The text content extracted from a PDF.
    :param page_count: The expected number of pages in the PDF.
    :param line_count_threshold: Minimum non-header lines per page.
    :return: True if OCR is needed, False otherwise.
    """
    inside_page = False
    found_page_marker = False
    other_content_count = 0

    for raw_line in content.splitlines():
        line = raw_line.strip()

        if not is_page_line(line):
            # Only lines after a page marker count, and only when they are
            # not header/footer boilerplate.
            if inside_page and not is_doc_common_header(line):
                other_content_count += 1
            continue

        # A new marker closes the previous page: check it had enough text.
        if inside_page and other_content_count < line_count_threshold:
            logger.info(
                f"Page with insufficient content: {other_content_count} lines (threshold: {line_count_threshold})"
            )
            return True
        inside_page = found_page_marker = True
        other_content_count = 0

    # End of document: the last page has no closing marker, check it here.
    if inside_page and other_content_count < line_count_threshold:
        logger.info(
            f"Trailing page with insufficient content: {other_content_count} lines (threshold: {line_count_threshold})"
        )
        return True

    if found_page_marker:
        # Every marked page carried enough content.
        return False

    # No page markers: collect any page numbers in 1..page_count that appear
    # in the text and see how many expected pages have none.
    numbered_pages = set()
    for pattern in (
        r"Page\s+(\d+)",
        r"- (\d+) -",
        r"\[(\d+)\]",
        r"(\d+)\s*$",
    ):
        for digits in re.findall(pattern, content, re.MULTILINE):
            try:
                number = int(digits)
                if 1 <= number <= page_count:
                    numbered_pages.add(number)
            except ValueError:
                continue

    missing_pages = set(range(1, page_count + 1)) - numbered_pages
    if len(missing_pages) > 2:
        logger.info(
            f"Missing pages: {sorted(missing_pages)} out of expected {page_count}"
        )
        return True

    # OCR is needed only when every line is empty or header boilerplate.
    return all(
        is_doc_common_header(text_line.strip())
        for text_line in content.splitlines()
    )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.