File tree Expand file tree Collapse file tree 3 files changed +12
-1
lines changed
Expand file tree Collapse file tree 3 files changed +12
-1
lines changed Original file line number Diff line number Diff line change 11## 0.13.4-dev4
22
33### Enhancements
4+
45* **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning
56 function are now deterministic and unique at the document level by default. Before, hashes were
67 based only on text; however, they now also take into account the element's sequence number on a
1213
1314### Features
1415
16+ * **Add a `PDF_ANNOTATION_THRESHOLD` environment variable to control the capture of embedded links in `partition_pdf()` for the `fast` strategy**.
1517* **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API.
1618
1719### Fixes
Original file line number Diff line number Diff line change 9090)
9191from unstructured .partition .strategies import determine_pdf_or_image_strategy , validate_strategy
9292from unstructured .partition .text import element_from_text
93+ from unstructured .partition .utils .config import env_config
9394from unstructured .partition .utils .constants import (
9495 SORT_MODE_BASIC ,
9596 SORT_MODE_DONT ,
@@ -705,7 +706,7 @@ def _process_pdfminer_pages(
705706 languages : List [str ],
706707 metadata_last_modified : Optional [str ],
707708 sort_mode : str = SORT_MODE_XY_CUT ,
708- annotation_threshold : Optional [float ] = 0.9 ,
709+ annotation_threshold : Optional [float ] = env_config . PDF_ANNOTATION_THRESHOLD ,
709710 starting_page_number : int = 1 ,
710711 ** kwargs ,
711712):
Original file line number Diff line number Diff line change @@ -109,5 +109,13 @@ def EMBEDDED_IMAGE_SAME_REGION_THRESHOLD(self) -> float:
109109 """threshold to consider the bounding boxes of two embedded images as the same region"""
110110 return self ._get_float ("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD" , 0.6 )
111111
112+ @property
113+ def PDF_ANNOTATION_THRESHOLD (self ) -> float :
114+ """The threshold value (between 0.0 and 1.0) that determines the minimum overlap required
115+ for an annotation to be considered within the element.
116+ """
117+
118+ return self ._get_float ("PDF_ANNOTATION_THRESHOLD" , 0.9 )
119+
112120
113121env_config = ENVConfig ()
You can’t perform that action at this time.
0 commit comments