feat: partition_pdf() add ability to get cid ratio (#2970)

christinestraub · web-flow · commit 0cd07d78f9e0 · 2024-05-04T05:21:27.000Z
This PR adds the ability to get the ratio of `cid` characters in embedded text extracted by `pdfminer`. This PR is the second part of moving `cid` related code from `unstructured-inference` to `unstructured` and works together with Unstructured-IO/unstructured-inference#342.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.13.7-dev6
+## 0.13.7-dev7
 
 ### Enhancements
 
@@ -7,6 +7,8 @@
 
 ### Features
 
+* **Add ability to get the ratio of `cid` characters in embedded text extracted by `pdfminer`.**
+
 ### Fixes
 
 * **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. While there are legitimate uses for this capability, using it in practice is relatively rare. However, it can happen unintentionally when adjusting cell borders with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.
diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -179,6 +179,33 @@ def test_valid_text(text, outcome):
     assert pdf_image_utils.valid_text(text) == outcome
 
 
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("base", 0.0),  # no cid token at all
+        ("", 0.0),  # empty text yields 0.0 through the early return
+        ("(cid:2)", 1.0),  # the text is a single cid token
+        ("(cid:1)a", 0.5),  # one cid token plus one ordinary character
+        ("c(cid:1)ab", 0.25),  # one cid token out of four counted characters
+    ],
+)
+def test_cid_ratio(text, expected):
+    assert pdf_image_utils.cid_ratio(text) == expected
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("base", False),  # shorter than "(cid:x)" and contains no marker
+        ("(cid:2)", True),  # smallest complete token
+        ("(cid:1234567890)", True),  # multi-digit code
+        ("jkl;(cid:12)asdf", True),  # token embedded in ordinary text
+    ],
+)
+def test_is_cid_present(text, expected):
+    assert pdf_image_utils.is_cid_present(text) == expected
+
+
 def test_pad_bbox():
     bbox = (100, 100, 200, 200)
     padding = (10, 20)  # Horizontal padding 10, Vertical padding 20
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.7-dev6"  # pragma: no cover
+__version__ = "0.13.7-dev7"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -1,5 +1,6 @@
 import base64
 import os
+import re
 import tempfile
 from copy import deepcopy
 from io import BytesIO
@@ -230,6 +231,23 @@ def valid_text(text: str) -> bool:
     return "(cid:" not in text
 
 
+def cid_ratio(text: str) -> float:
+    """Gets ratio of unknown 'cid' characters extracted from text to all characters."""
+    if not is_cid_present(text):  # no "(cid:" marker: skip the regex pass
+        return 0.0
+    cid_pattern = r"\(cid\:(\d+)\)"  # a literal "(cid:<digits>)" token
+    unmatched, n_cid = re.subn(cid_pattern, "", text)  # strip the tokens and count them
+    total = n_cid + len(unmatched)  # each cid token counts as one character
+    return n_cid / total  # total >= 1: text is non-empty past the guard above
+
+
+def is_cid_present(text: str) -> bool:
+    """Checks if a cid code is present in a text selection."""
+    if len(text) < len("(cid:x)"):  # too short to hold even the smallest token
+        return False
+    return text.find("(cid:") != -1  # NOTE(review): only the "(cid:" prefix is checked
+
+
 def annotate_layout_elements_with_image(
     inferred_page_layout: "PageLayout",
     extracted_page_layout: Optional["PageLayout"],

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.13.7-dev6" # pragma: no cover`
	`1`	`+__version__ = "0.13.7-dev7" # pragma: no cover`