feat: configure googlevisionapi (#3126)

MthwRobinson · JIAQIA · dlozeve · web-flow · commit 6005abce79f3 · 2024-05-31T18:41:04.000Z
### Summary Includes changes from #3117, merged into a feature branch to run the full test suite. Original PR description: The Google Vision API supports [configuration of the API endpoint](https://cloud.google.com/vision/docs/ocr#regionalization) to select whether data is sent to the US or the EU. This PR adds an environment variable (`GOOGLEVISION_API_ENDPOINT`) to configure it. --------- Co-authored-by: JIAQIA <jqq1716@gmail.com> Co-authored-by: Dimitri Lozeve <dimitri@lozeve.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@
 
 ### Features
 
+* **Allow configuration of the Google Vision API endpoint** Add the `GOOGLEVISION_API_ENDPOINT` environment variable to select whether the US or the EU Google Vision API endpoint is used.
+
 ### Fixes
 
 * **Fix V2 S3 Destination Connector authentication** Fixes bugs with S3 Destination Connector where the connection config was neither registered nor properly deserialized.
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -95,11 +95,14 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
         """optimum text height for tesseract OCR"""
         return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
 
+    @property
+    def GOOGLEVISION_API_ENDPOINT(self) -> str:
+        """API endpoint to use for Google Vision"""
+        return self._get_string("GOOGLEVISION_API_ENDPOINT", "")
+
     @property
     def OCR_AGENT(self) -> str:
-        """error margin when comparing if a ocr region is within the table element when preparing
-        table tokens
-        """
+        """OCR Agent to use"""
         return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)
 
     @property
diff --git a/unstructured/partition/utils/ocr_models/google_vision_ocr.py b/unstructured/partition/utils/ocr_models/google_vision_ocr.py
@@ -5,6 +5,8 @@
 
 from google.cloud.vision import Image, ImageAnnotatorClient, Paragraph, TextAnnotation
 
+from unstructured.logger import logger, trace_logger
+from unstructured.partition.utils.config import env_config
 from unstructured.partition.utils.constants import Source
 from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
 
@@ -18,7 +20,14 @@ class OCRAgentGoogleVision(OCRAgent):
     """OCR service implementation for Google Vision API."""
 
     def __init__(self) -> None:
-        self.client = ImageAnnotatorClient()
+        client_options = {}
+        api_endpoint = env_config.GOOGLEVISION_API_ENDPOINT
+        if api_endpoint:
+            logger.info(f"Using Google Vision OCR with endpoint {api_endpoint}")
+            client_options["api_endpoint"] = api_endpoint
+        else:
+            logger.info("Using Google Vision OCR with default endpoint")
+        self.client = ImageAnnotatorClient(client_options=client_options)
 
     def is_text_sorted(self) -> bool:
         return True
@@ -34,6 +43,7 @@ def get_text_from_image(self, image: PILImage.Image, ocr_languages: str = "eng")
     def get_layout_from_image(
         self, image: PILImage.Image, ocr_languages: str = "eng"
     ) -> list[TextRegion]:
+        trace_logger.detail("Processing entire page OCR with Google Vision API...")
         with BytesIO() as buffer:
             image.save(buffer, format="PNG")
             response = self.client.document_text_detection(image=Image(content=buffer.getvalue()))