aws-samples · Belval · Mar 7, 2025 · Mar 7, 2025
diff --git a/textractor/entities/document.py b/textractor/entities/document.py
@@ -39,6 +39,7 @@
 from textractor.data.html_linearization_config import HTMLLinearizationConfig
 from textractor.entities.linearizable import Linearizable
 
+logger = logging.getLogger(__name__)
 
 class Document(SpatialObject, Linearizable):
     """
@@ -389,7 +390,7 @@ def get_words_by_type(self, text_type: TextTypes = TextTypes.PRINTED) -> List[Wo
         :rtype: EntityList[Word]
         """
         if not self.words:
-            logging.warn("Document contains no word entities.")
+            logger.warning("Document contains no word entities.")
             return []
 
         filtered_words = EntityList()
@@ -554,12 +555,12 @@ def get(
                 lowest_similarity = top_n[-1][1]
 
         if not top_n:
-            logging.warning(
+            logger.warning(
                 f"Query key does not match any existing keys in the document.{os.linesep}{self.keys()}"
             )
             return EntityList([])
 
-        logging.info(f"Query key matched {len(top_n)} key-values in the document.")
+        logger.info(f"Query key matched {len(top_n)} key-values in the document.")
 
         return EntityList([value[0] for value in top_n])
 
@@ -586,14 +587,14 @@ def export_kv_to_csv(
         keys = []
         values = []
         if include_kv and not self.key_values:
-            logging.warning("Document does not contain key-values.")
+            logger.warning("Document does not contain key-values.")
         elif include_kv:
             for kv in self.key_values:
                 keys.append(" ".join([w.text for w in kv.key]))
                 values.append(kv.value.get_text())
 
         if include_checkboxes and not self.checkboxes:
-            logging.warning("Document does not contain checkbox elements.")
+            logger.warning("Document does not contain checkbox elements.")
         elif include_checkboxes:
             for kv in self.checkboxes:
                 keys.append(" ".join([w.text for w in kv.key]))
@@ -604,7 +605,7 @@ def export_kv_to_csv(
             for k, v in zip(keys, values):
                 f.write(f"{k}{sep}{v}{os.linesep}")
 
-        logging.info(
+        logger.info(
             f"csv file stored at location {os.path.join(os.getcwd(),filepath)}"
         )
 
@@ -627,7 +628,7 @@ def export_kv_to_txt(
         export_str = ""
         index = 1
         if include_kv and not self.key_values:
-            logging.warning("Document does not contain key-values.")
+            logger.warning("Document does not contain key-values.")
         elif include_kv:
             for kv in self.key_values:
                 export_str += (
@@ -636,15 +637,15 @@ def export_kv_to_txt(
                 index += 1
 
         if include_checkboxes and not self.checkboxes:
-            logging.warning("Document does not contain checkbox elements.")
+            logger.warning("Document does not contain checkbox elements.")
         elif include_checkboxes:
             for kv in self.checkboxes:
                 export_str += f"{index}. {kv.key.__repr__()} : {kv.value.children[0].status.name}{os.linesep}"
                 index += 1
 
         with open(filepath, "w") as text_file:
             text_file.write(export_str)
-        logging.info(
+        logger.info(
             f"txt file stored at location {os.path.join(os.getcwd(),filepath)}"
         )
 
@@ -657,7 +658,7 @@ def export_tables_to_excel(self, filepath):
         :type filepath: str, required
         """
         if not filepath:
-            logging.error("Filepath required to store excel file.")
+            logger.error("Filepath required to store excel file.")
         workbook = xlsxwriter.Workbook(filepath)
         for table in self.tables:
             workbook = table.to_excel(
@@ -671,7 +672,7 @@ def independent_words(self):
         :rtype: EntityList[Word]
         """
         if not self.words:
-            logging.warning("Words have not been assigned to this Document object.")
+            logger.warning("Words have not been assigned to this Document object.")
             return []
 
         else:
@@ -824,7 +825,7 @@ def _get_coords(self, word_1, word_2, direction, page):
         )
 
         if not word_1_objects:
-            logging.warning(f"{word_1} not found in page {page}")
+            logger.warning(f"{word_1} not found in page {page}")
             return -1, -1, -1, -1
         else:
             word_1_obj = word_1_objects[0]
@@ -839,7 +840,7 @@ def _get_coords(self, word_1, word_2, direction, page):
             )
             word_2_objects = [word for word in word_2_objects if word.page == page]
             if not word_2_objects:
-                logging.warning(f"{word_2} not found in page {page}")
+                logger.warning(f"{word_2} not found in page {page}")
                 return -1, -1, -1, -1
             else:
                 word_2_obj = word_2_objects[0]

diff --git a/textractor/entities/expense_field.py b/textractor/entities/expense_field.py
@@ -8,6 +8,7 @@
 from textractor.data.constants import AnalyzeExpenseLineItemFields as AELineItems
 from typing import List, Tuple
 
+logger = logging.getLogger(__name__)
 
 @dataclasses.dataclass
 class ExpenseType:
@@ -257,7 +258,7 @@ def to_pandas(self, include_EXPENSE_ROW=False):
         try:
             from pandas import DataFrame
         except ImportError:
-            logging.info(
+            logger.info(
                 "pandas library is required for exporting tables to DataFrame objects"
             )
             return None

diff --git a/textractor/entities/identity_document.py b/textractor/entities/identity_document.py
@@ -1,12 +1,7 @@
 """The IdentityDocument class is the object representation of an AnalyzeID response. It is similar to a dictionary. Despite its name it does not inherit from Document as the AnalyzeID response does not contains position information."""
 
 import os
-import string
-import logging
-import xlsxwriter
 from typing import List, Dict, Union
-from copy import deepcopy
-from collections import defaultdict
 from textractor.data.constants import AnalyzeIDFields
 from textractor.entities.bbox import SpatialObject
 from textractor.entities.identity_field import IdentityField

diff --git a/textractor/entities/key_value.py b/textractor/entities/key_value.py
@@ -20,6 +20,7 @@
 from textractor.visualizers.entitylist import EntityList
 from textractor.utils.html_utils import add_id_to_html_tag
 
+logger = logging.getLogger(__name__)
 
 class KeyValue(DocumentEntity):
     """
@@ -101,7 +102,7 @@ def key(self):
         :rtype: EntityList[Word]
         """
         if not self._words:
-            logging.info("Key contains no words objects.")
+            logger.info("Key contains no words objects.")
             return []
         return self._words
 
@@ -123,7 +124,7 @@ def value(self) -> Value:
         :rtype: Value
         """
         if self._value is None:
-            logging.warning(
+            logger.warning(
                 "Asked for a value but it was never attributed "
                 "-> make sure to assign value to key with the `kv.value = <Value Object>` property setter"
             )
@@ -193,7 +194,7 @@ def get_words_by_type(self, text_type: str = TextTypes.PRINTED) -> List[Word]:
             )
 
         if not self.words:
-            logging.info("Document contains no word entities.")
+            logger.info("Document contains no word entities.")
             return []
         else:
             return EntityList(
@@ -211,12 +212,12 @@ def is_selected(self) -> bool:
             if len(self.value.children) == 1:
                 return self.value.children[0].is_selected()
             else:
-                logging.info(
+                logger.info(
                     "is_checked() was called on a KeyValue that contains more than one checkbox. Returning first checkbox"
                 )
                 return self.value.children[0].is_selected()
         else:
-            logging.info(
+            logger.info(
                 "is_checked() was called on a KeyValue that does not contain checkboxes. Returning False"
             )
             return False

diff --git a/textractor/entities/page.py b/textractor/entities/page.py
@@ -44,6 +44,7 @@
 from textractor.visualizers.entitylist import EntityList
 from textractor.entities.linearizable import Linearizable
 
+logger = logging.getLogger(__name__)
 
 class Page(SpatialObject, Linearizable):
     """
@@ -435,7 +436,7 @@ def filter_checkboxes(
         :rtype: EntityList[KeyValue]
         """
         if not self.checkboxes:
-            logging.warning(f"This document does not contain checkboxes")
+            logger.warning(f"This document does not contain checkboxes")
             return []
         else:
             if selected and not_selected:
@@ -476,7 +477,7 @@ def get_words_by_type(
             )
 
         if not self.words:
-            logging.warn("Document contains no word entities.")
+            logger.warning("Document contains no word entities.")
             return []
 
         filtered_words = [word for word in self.words if word.text_type == text_type]
@@ -750,11 +751,11 @@ def get(
                 lowest_similarity = top_n[-1][1]
 
         if not top_n:
-            logging.warning(
+            logger.warning(
                 f"Query key does not match any existing keys in the document.{os.linesep}{self.keys()}"
             )
 
-        logging.info(f"Query key matched {len(top_n)} key-values in the document.")
+        logger.info(f"Query key matched {len(top_n)} key-values in the document.")
 
         return EntityList([value[0] for value in top_n])
 
@@ -778,14 +779,14 @@ def export_kv_to_csv(
         keys = []
         values = []
         if include_kv and not self.key_values:
-            logging.warning("Document does not contain key-values.")
+            logger.warning("Document does not contain key-values.")
         elif include_kv:
             for kv in self.key_values:
                 keys.append(kv.key.__repr__())
                 values.append(kv.value.__repr__())
 
         if include_checkboxes and not self.checkboxes:
-            logging.warning("Document does not contain checkbox elements.")
+            logger.warning("Document does not contain checkbox elements.")
         elif include_checkboxes:
             for kv in self.checkboxes:
                 keys.append(kv.key.__repr__())
@@ -796,7 +797,7 @@ def export_kv_to_csv(
             for k, v in zip(keys, values):
                 f.write(f"{k},{v}{os.linesep}")
 
-        logging.info(
+        logger.info(
             f"csv file stored at location {os.path.join(os.getcwd(), filepath)}"
         )
 
@@ -819,7 +820,7 @@ def export_kv_to_txt(
         export_str = []
         index = 1
         if include_kv and not self.key_values:
-            logging.warning("Document does not contain key-values.")
+            logger.warning("Document does not contain key-values.")
         elif include_kv:
             for kv in self.key_values:
                 export_str.append(
@@ -828,7 +829,7 @@ def export_kv_to_txt(
                 index += 1
 
         if include_checkboxes and not self.checkboxes:
-            logging.warning("Document does not contain checkbox elements.")
+            logger.warning("Document does not contain checkbox elements.")
         elif include_checkboxes:
             for kv in self.checkboxes:
                 export_str.append(
@@ -838,7 +839,7 @@ def export_kv_to_txt(
 
         with open(filepath, "w") as text_file:
             text_file.write("".join(export_str))
-        logging.info(
+        logger.info(
             f"txt file stored at location {os.path.join(os.getcwd(),filepath)}"
         )
 
@@ -848,7 +849,7 @@ def independent_words(self) -> EntityList[Word]:
         :rtype: EntityList[Word]
         """
         if not self.words:
-            logging.warning("Words have not been assigned to this Document object.")
+            logger.warning("Words have not been assigned to this Document object.")
             return []
 
         else:
@@ -871,7 +872,7 @@ def export_tables_to_excel(self, filepath):
         :type filepath: str, required
         """
         if not filepath:
-            logging.error("Filepath required to store excel file.")
+            logger.error("Filepath required to store excel file.")
         workbook = xlsxwriter.Workbook(filepath)
         for table in self.tables:
             workbook = table.to_excel(
@@ -1041,7 +1042,7 @@ def _get_coords(self, word_1, word_2, direction):
         )
 
         if not word_1_objects:
-            logging.warning(f"{word_1} not found in page")
+            logger.warning(f"{word_1} not found in page")
             return -1, -1, -1, -1
         else:
             word_1_obj = word_1_objects[0]
@@ -1056,7 +1057,7 @@ def _get_coords(self, word_1, word_2, direction):
             )
 
             if not word_2_objects:
-                logging.warning(f"{word_2} not found in page")
+                logger.warning(f"{word_2} not found in page")
                 return -1, -1, -1, -1
             else:
                 word_2_obj = word_2_objects[0]

diff --git a/textractor/entities/signature.py b/textractor/entities/signature.py
@@ -6,8 +6,6 @@
 bounding box information, page number, Page ID and confidence of detection.
 """
 
-import logging
-from typing import List
 import uuid
 from textractor.data.text_linearization_config import TextLinearizationConfig