
Commit eb43642

Merge branch 'main' into pprados/fix_password

2 parents 8d1ac47 + 38eb661

File tree

11 files changed: +206 −70 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ wheels/
 pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
+nltk_data/
 .installed.cfg
 *.egg
 MANIFEST

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
@@ -1,3 +1,15 @@
+## 0.16.13
+
+### Enhancements
+- **Add character-level filtering for tesseract output.** It is controllable via the `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable.
+
+### Features
+
+### Fixes
+
+- **Fix NLTK download** to use the nltk assets baked into the docker image
+- Removed the ability to automatically download nltk packages when missing
+
 ## 0.16.12
 
 ### Enhancements
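
A quick usage sketch of the new threshold for reviewers. It assumes the library's existing public partition_pdf API and one of the repo's example documents; the value is a fraction in [0, 1], and OCR characters Tesseract scores below it are dropped:

import os

# Drop OCR characters below 96% confidence; unset (default 0.0) keeps everything.
os.environ["TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD"] = "0.96"

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf("example-docs/layout-parser-paper.pdf", strategy="hi_res")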

Dockerfile

Lines changed: 15 additions & 10 deletions
@@ -1,4 +1,7 @@
-FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base
+FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base
+
+ARG PYTHON=python3.11
+ARG PIP=pip3.11
 
 USER root
 
@@ -10,18 +13,20 @@ COPY test_unstructured test_unstructured
 COPY example-docs example-docs
 
 RUN chown -R notebook-user:notebook-user /app && \
-    apk add font-ubuntu git && \
-    fc-cache -fv && \
-    if [ "$(readlink -f /usr/bin/python3)" != "/usr/bin/python3.11" ]; then \
-      ln -sf /usr/bin/python3.11 /usr/bin/python3; \
-    fi
+    apk add font-ubuntu git && \
+    fc-cache -fv && \
+    [ -e /usr/bin/python3 ] || ln -s /usr/bin/$PYTHON /usr/bin/python3
 
 USER notebook-user
 
-RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
-    python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
-    python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
-    python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
+ENV NLTK_DATA=/home/notebook-user/nltk_data
+
+# Install Python dependencies and download required NLTK packages
+RUN find requirements/ -type f -name "*.txt" -exec $PIP install --no-cache-dir --user -r '{}' ';' && \
+    mkdir -p ${NLTK_DATA} && \
+    $PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \
+    $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
+    $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
 
 ENV PATH="${PATH}:/home/notebook-user/.local/bin"
 ENV TESSDATA_PREFIX=/usr/local/share/tessdata
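
With the NLTK assets baked into the image at build time instead of downloaded lazily, an offline smoke test inside the container can confirm they resolve. A minimal sketch, assuming only the NLTK_DATA location set above:

import nltk

# NLTK_DATA is set in the image, so nltk.data.path already includes it;
# nltk.data.find raises LookupError if a package is missing.
nltk.data.find("tokenizers/punkt_tab")
nltk.data.find("taggers/averaged_perceptron_tagger_eng")
print("nltk assets resolved without network access")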

test_unstructured/nlp/test_tokenize.py

Lines changed: 0 additions & 20 deletions
@@ -1,29 +1,9 @@
 from typing import List, Tuple
-from unittest.mock import patch
-
-import nltk
 
 from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 from unstructured.nlp import tokenize
 
 
-def test_nltk_packages_download_if_not_present():
-    tokenize._download_nltk_packages_if_not_present.cache_clear()
-    with patch.object(nltk, "find", side_effect=LookupError):
-        with patch.object(tokenize, "download_nltk_packages") as mock_download:
-            tokenize._download_nltk_packages_if_not_present()
-
-    mock_download.assert_called_once()
-
-
-def test_nltk_packages_do_not_download_if():
-    tokenize._download_nltk_packages_if_not_present.cache_clear()
-    with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
-        tokenize._download_nltk_packages_if_not_present()
-
-    mock_download.assert_not_called()
-
-
 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
     pos_tags: List[Tuple[str, str]] = []
     for token in tokens:

test_unstructured/partition/pdf_image/test_ocr.py

Lines changed: 82 additions & 4 deletions
@@ -6,6 +6,7 @@
 import pandas as pd
 import pytest
 import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
 from pdf2image.exceptions import PDFPageCountError
 from PIL import Image, UnidentifiedImageError
 from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@@ -71,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
 
 def test_get_ocr_layout_from_image_tesseract(monkeypatch):
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],
@@ -445,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
     monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
     monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],
@@ -484,3 +485,80 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
     # Check if the final layout contains both original elements and OCR-derived elements
     assert all(element in final_layout for element in mock_out_layout)
     assert any(element in final_layout for element in ocr_elements)
+
+
+def _create_hocr_word_span(
+    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
+) -> Tag:
+    word_span = BeautifulSoup(
+        f"<span class='ocrx_word' title='"
+        f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
+        f"; x_wconf 64'></span>",
+        "html.parser",
+    ).span
+    for char, x_conf in characters:
+        char_span = BeautifulSoup(
+            f"""
+            <span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
+            """,  # noqa: E501
+            "html.parser",
+        ).span
+        word_span.append(char_span)
+    return word_span
+
+
+def test_extract_word_from_hocr():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    word_span = _create_hocr_word_span(characters, word_bbox)
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
+    assert text == "word!@"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
+    assert text == "word"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
+    assert text == "w"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
+    assert text == ""
+
+
+def test_hocr_to_dataframe():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    hocr = str(_create_hocr_word_span(characters, word_bbox))
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
+
+    assert df.shape == (1, 5)
+    assert df["left"].iloc[0] == 10
+    assert df["top"].iloc[0] == 9
+    assert df["width"].iloc[0] == 60
+    assert df["height"].iloc[0] == 13
+    assert df["text"].iloc[0] == "word"
+
+
+def test_hocr_to_dataframe_when_no_prediction_empty_df():
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")
+
+    assert df.shape == (0, 5)
+    assert "left" in df.columns
+    assert "top" in df.columns
+    assert "width" in df.columns
+    assert "height" in df.columns
+    assert "text" in df.columns

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 2 additions & 2 deletions
@@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None():
     [
         (
             PartitionStrategy.HI_RES,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,
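
Context for the mock swap above: per-character confidences are only available in hOCR output, not in the TSV that image_to_data returns, so the patched call site moves to image_to_pdf_or_hocr. A hedged sketch of the underlying call; the extension keyword mirrors pytesseract's signature, and the hocr_char_boxes flag that emits ocrx_cinfo spans is an assumption about the agent's exact tesseract config:

import unstructured_pytesseract
from PIL import Image

image = Image.open("example-docs/english-and-korean.png")
# extension="hocr" returns XHTML; character-level spans typically require a
# tesseract flag such as "-c hocr_char_boxes=1" (assumption).
hocr = unstructured_pytesseract.image_to_pdf_or_hocr(
    image, extension="hocr", config="-c hocr_char_boxes=1"
)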

test_unstructured_ingest/test-ingest-src.sh

Lines changed: 2 additions & 2 deletions
@@ -40,8 +40,8 @@ all_tests=(
   'against-api.sh'
   'gcs.sh'
   'kafka-local.sh'
-  'onedrive.sh'
-  'outlook.sh'
+  #'onedrive.sh'
+  #'outlook.sh'
   'elasticsearch.sh'
   'confluence-diff.sh'
   'confluence-large.sh'

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.16.12"  # pragma: no cover
+__version__ = "0.16.13"  # pragma: no cover

unstructured/nlp/tokenize.py

Lines changed: 2 additions & 25 deletions
@@ -18,7 +18,7 @@ def download_nltk_packages():
 
 
 def check_for_nltk_package(package_name: str, package_category: str) -> bool:
-    """Checks to see if the specified NLTK package exists on the file system"""
+    """Checks to see if the specified NLTK package exists on the image."""
     paths: list[str] = []
     for path in nltk.data.path:
         if not path.endswith("nltk_data"):
@@ -32,45 +32,22 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool:
     return False
 
 
-# We cache this because we do not want to attempt
-# downloading the packages multiple times
-@lru_cache()
-def _download_nltk_packages_if_not_present():
-    """If required NLTK packages are not available, download them."""
-
-    tagger_available = check_for_nltk_package(
-        package_category="taggers",
-        package_name="averaged_perceptron_tagger_eng",
-    )
-    tokenizer_available = check_for_nltk_package(
-        package_category="tokenizers", package_name="punkt_tab"
-    )
-
-    if (not tokenizer_available) or (not tagger_available):
-        download_nltk_packages()
-
-
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
-    _download_nltk_packages_if_not_present()
     return _sent_tokenize(text)
 
 
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def word_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK word tokenizer with LRU caching enabled."""
-    _download_nltk_packages_if_not_present()
     return _word_tokenize(text)
 
 
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def pos_tag(text: str) -> List[Tuple[str, str]]:
     """A wrapper around the NLTK POS tagger with LRU caching enabled."""
-    _download_nltk_packages_if_not_present()
-    # NOTE(robinson) - Splitting into sentences before tokenizing. The helps with
-    # situations like "ITEM 1A. PROPERTIES" where "PROPERTIES" can be mistaken
-    # for a verb because it looks like it's in verb form an "ITEM 1A." looks like the subject.
+    # Splitting into sentences before tokenizing.
     sentences = _sent_tokenize(text)
     parts_of_speech: list[tuple[str, str]] = []
     for sentence in sentences:
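
Because the lazy download path is gone, callers running outside the docker image must provision the assets themselves before sent_tokenize, word_tokenize, or pos_tag will work. A minimal setup sketch using the two helpers this module keeps:

from unstructured.nlp.tokenize import check_for_nltk_package, download_nltk_packages

# One-time setup; the tokenizer wrappers no longer self-download on a cache miss.
have_tokenizer = check_for_nltk_package(package_name="punkt_tab", package_category="tokenizers")
have_tagger = check_for_nltk_package(
    package_name="averaged_perceptron_tagger_eng", package_category="taggers"
)
if not (have_tokenizer and have_tagger):
    download_nltk_packages()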

unstructured/partition/utils/config.py

Lines changed: 5 additions & 0 deletions
@@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
         """optimum text height for tesseract OCR"""
         return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
 
+    @property
+    def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
+        """Tesseract predictions with confidence below this threshold are ignored"""
+        return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
+
     @property
     def GOOGLEVISION_API_ENDPOINT(self) -> str:
         """API endpoint to use for Google Vision"""
