
Commit eb43642

Merge branch 'main' into pprados/fix_password

2 parents 8d1ac47 + 38eb661

File tree

11 files changed: +206 −70 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ wheels/
 pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
+nltk_data/
 .installed.cfg
 *.egg
 MANIFEST

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
@@ -1,3 +1,15 @@
+## 0.16.13
+
+### Enhancements
+- **Add character-level filtering for tesseract output.** It is controllable via the `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable.
+
+### Features
+
+### Fixes
+
+- **Fix NLTK download** to use the nltk assets baked into the docker image
+- Removed the ability to automatically download nltk packages when missing
+
 ## 0.16.12
 
 ### Enhancements
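
A quick usage sketch of the new threshold for reviewers. It assumes the library's existing public partition_pdf API and one of the repo's example documents; the value is a fraction in [0, 1], and OCR characters Tesseract scores below it are dropped:

import os

# Drop OCR characters below 96% confidence; unset (default 0.0) keeps everything.
os.environ["TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD"] = "0.96"

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf("example-docs/layout-parser-paper.pdf", strategy="hi_res")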

Dockerfile

Lines changed: 15 additions & 10 deletions
@@ -1,4 +1,7 @@
-FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base
+FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base
+
+ARG PYTHON=python3.11
+ARG PIP=pip3.11
 
 USER root
 
@@ -10,18 +13,20 @@ COPY test_unstructured test_unstructured
 COPY example-docs example-docs
 
 RUN chown -R notebook-user:notebook-user /app && \
-    apk add font-ubuntu git && \
-    fc-cache -fv && \
-    if [ "$(readlink -f /usr/bin/python3)" != "/usr/bin/python3.11" ]; then \
-      ln -sf /usr/bin/python3.11 /usr/bin/python3; \
-    fi
+    apk add font-ubuntu git && \
+    fc-cache -fv && \
+    [ -e /usr/bin/python3 ] || ln -s /usr/bin/$PYTHON /usr/bin/python3
 
 USER notebook-user
 
-RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
-    python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
-    python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
-    python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
+ENV NLTK_DATA=/home/notebook-user/nltk_data
+
+# Install Python dependencies and download required NLTK packages
+RUN find requirements/ -type f -name "*.txt" -exec $PIP install --no-cache-dir --user -r '{}' ';' && \
+    mkdir -p ${NLTK_DATA} && \
+    $PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \
+    $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
+    $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
 
 ENV PATH="${PATH}:/home/notebook-user/.local/bin"
 ENV TESSDATA_PREFIX=/usr/local/share/tessdata
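
With the NLTK assets baked into the image at build time instead of downloaded lazily, an offline smoke test inside the container can confirm they resolve. A minimal sketch, assuming only the NLTK_DATA location set above:

import nltk

# NLTK_DATA is set in the image, so nltk.data.path already includes it;
# nltk.data.find raises LookupError if a package is missing.
nltk.data.find("tokenizers/punkt_tab")
nltk.data.find("taggers/averaged_perceptron_tagger_eng")
print("nltk assets resolved without network access")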

test_unstructured/nlp/test_tokenize.py

Lines changed: 0 additions & 20 deletions
@@ -1,29 +1,9 @@
 from typing import List, Tuple
-from unittest.mock import patch
-
-import nltk
 
 from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 from unstructured.nlp import tokenize
 
 
-def test_nltk_packages_download_if_not_present():
-    tokenize._download_nltk_packages_if_not_present.cache_clear()
-    with patch.object(nltk, "find", side_effect=LookupError):
-        with patch.object(tokenize, "download_nltk_packages") as mock_download:
-            tokenize._download_nltk_packages_if_not_present()
-
-    mock_download.assert_called_once()
-
-
-def test_nltk_packages_do_not_download_if():
-    tokenize._download_nltk_packages_if_not_present.cache_clear()
-    with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
-        tokenize._download_nltk_packages_if_not_present()
-
-    mock_download.assert_not_called()
-
-
 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
     pos_tags: List[Tuple[str, str]] = []
     for token in tokens:

test_unstructured/partition/pdf_image/test_ocr.py

Lines changed: 82 additions & 4 deletions
@@ -6,6 +6,7 @@
 import pandas as pd
 import pytest
 import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
 from pdf2image.exceptions import PDFPageCountError
 from PIL import Image, UnidentifiedImageError
 from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@@ -71,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
 
 def test_get_ocr_layout_from_image_tesseract(monkeypatch):
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],
@@ -445,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
     monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
     monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],
@@ -484,3 +485,80 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
     # Check if the final layout contains both original elements and OCR-derived elements
     assert all(element in final_layout for element in mock_out_layout)
     assert any(element in final_layout for element in ocr_elements)
+
+
+def _create_hocr_word_span(
+    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
+) -> Tag:
+    word_span = BeautifulSoup(
+        f"<span class='ocrx_word' title='"
+        f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
+        f"; x_wconf 64'></span>",
+        "html.parser",
+    ).span
+    for char, x_conf in characters:
+        char_span = BeautifulSoup(
+            f"""
+            <span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
+            """,  # noqa: E501
+            "html.parser",
+        ).span
+        word_span.append(char_span)
+    return word_span
+
+
+def test_extract_word_from_hocr():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    word_span = _create_hocr_word_span(characters, word_bbox)
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
+    assert text == "word!@"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
+    assert text == "word"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
+    assert text == "w"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
+    assert text == ""
+
+
+def test_hocr_to_dataframe():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    hocr = str(_create_hocr_word_span(characters, word_bbox))
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
+
+    assert df.shape == (1, 5)
+    assert df["left"].iloc[0] == 10
+    assert df["top"].iloc[0] == 9
+    assert df["width"].iloc[0] == 60
+    assert df["height"].iloc[0] == 13
+    assert df["text"].iloc[0] == "word"
+
+
+def test_hocr_to_dataframe_when_no_prediction_empty_df():
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")
+
+    assert df.shape == (0, 5)
+    assert "left" in df.columns
+    assert "top" in df.columns
+    assert "width" in df.columns
+    assert "height" in df.columns
+    assert "text" in df.columns

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 2 additions & 2 deletions
@@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None():
     [
         (
             PartitionStrategy.HI_RES,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,
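
Context for the mock swap above: per-character confidences are only available in hOCR output, not in the TSV that image_to_data returns, so the patched call site moves to image_to_pdf_or_hocr. A hedged sketch of the underlying call; the extension keyword mirrors pytesseract's signature, and the hocr_char_boxes flag that emits ocrx_cinfo spans is an assumption about the agent's exact tesseract config:

import unstructured_pytesseract
from PIL import Image

image = Image.open("example-docs/english-and-korean.png")
# extension="hocr" returns XHTML; character-level spans typically require a
# tesseract flag such as "-c hocr_char_boxes=1" (assumption).
hocr = unstructured_pytesseract.image_to_pdf_or_hocr(
    image, extension="hocr", config="-c hocr_char_boxes=1"
)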

test_unstructured_ingest/test-ingest-src.sh

Lines changed: 2 additions & 2 deletions
@@ -40,8 +40,8 @@ all_tests=(
   'against-api.sh'
   'gcs.sh'
   'kafka-local.sh'
-  'onedrive.sh'
-  'outlook.sh'
+  #'onedrive.sh'
+  #'outlook.sh'
   'elasticsearch.sh'
   'confluence-diff.sh'
   'confluence-large.sh'

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.16.12"  # pragma: no cover
+__version__ = "0.16.13"  # pragma: no cover

unstructured/nlp/tokenize.py

Lines changed: 2 additions & 25 deletions
@@ -18,7 +18,7 @@ def download_nltk_packages():
 
 
 def check_for_nltk_package(package_name: str, package_category: str) -> bool:
-    """Checks to see if the specified NLTK package exists on the file system"""
+    """Checks to see if the specified NLTK package exists on the image."""
     paths: list[str] = []
     for path in nltk.data.path:
         if not path.endswith("nltk_data"):
@@ -32,45 +32,22 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool:
     return False
 
 
-# We cache this because we do not want to attempt
-# downloading the packages multiple times
-@lru_cache()
-def _download_nltk_packages_if_not_present():
-    """If required NLTK packages are not available, download them."""
-
-    tagger_available = check_for_nltk_package(
-        package_category="taggers",
-        package_name="averaged_perceptron_tagger_eng",
-    )
-    tokenizer_available = check_for_nltk_package(
-        package_category="tokenizers", package_name="punkt_tab"
-    )
-
-    if (not tokenizer_available) or (not tagger_available):
-        download_nltk_packages()
-
-
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
-    _download_nltk_packages_if_not_present()
     return _sent_tokenize(text)
 
 
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def word_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK word tokenizer with LRU caching enabled."""
-    _download_nltk_packages_if_not_present()
     return _word_tokenize(text)
 
 
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def pos_tag(text: str) -> List[Tuple[str, str]]:
     """A wrapper around the NLTK POS tagger with LRU caching enabled."""
-    _download_nltk_packages_if_not_present()
-    # NOTE(robinson) - Splitting into sentences before tokenizing. The helps with
-    # situations like "ITEM 1A. PROPERTIES" where "PROPERTIES" can be mistaken
-    # for a verb because it looks like it's in verb form an "ITEM 1A." looks like the subject.
+    # Splitting into sentences before tokenizing.
     sentences = _sent_tokenize(text)
     parts_of_speech: list[tuple[str, str]] = []
     for sentence in sentences:
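
Because the lazy download path is gone, callers running outside the docker image must provision the assets themselves before sent_tokenize, word_tokenize, or pos_tag will work. A minimal setup sketch using the two helpers this module keeps:

from unstructured.nlp.tokenize import check_for_nltk_package, download_nltk_packages

# One-time setup; the tokenizer wrappers no longer self-download on a cache miss.
have_tokenizer = check_for_nltk_package(package_name="punkt_tab", package_category="tokenizers")
have_tagger = check_for_nltk_package(
    package_name="averaged_perceptron_tagger_eng", package_category="taggers"
)
if not (have_tokenizer and have_tagger):
    download_nltk_packages()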

unstructured/partition/utils/config.py

Lines changed: 5 additions & 0 deletions
@@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
         """optimum text height for tesseract OCR"""
         return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
 
+    @property
+    def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
+        """Tesseract predictions with confidence below this threshold are ignored"""
+        return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
+
     @property
     def GOOGLEVISION_API_ENDPOINT(self) -> str:
         """API endpoint to use for Google Vision"""
