Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 14 additions & 13 deletions textractor/entities/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from textractor.data.html_linearization_config import HTMLLinearizationConfig
from textractor.entities.linearizable import Linearizable

logger = logging.getLogger(__name__)

class Document(SpatialObject, Linearizable):
"""
Expand Down Expand Up @@ -389,7 +390,7 @@ def get_words_by_type(self, text_type: TextTypes = TextTypes.PRINTED) -> List[Wo
:rtype: EntityList[Word]
"""
if not self.words:
logging.warn("Document contains no word entities.")
logger.warning("Document contains no word entities.")
return []

filtered_words = EntityList()
Expand Down Expand Up @@ -554,12 +555,12 @@ def get(
lowest_similarity = top_n[-1][1]

if not top_n:
logging.warning(
logger.warning(
f"Query key does not match any existing keys in the document.{os.linesep}{self.keys()}"
)
return EntityList([])

logging.info(f"Query key matched {len(top_n)} key-values in the document.")
logger.info(f"Query key matched {len(top_n)} key-values in the document.")

return EntityList([value[0] for value in top_n])

Expand All @@ -586,14 +587,14 @@ def export_kv_to_csv(
keys = []
values = []
if include_kv and not self.key_values:
logging.warning("Document does not contain key-values.")
logger.warning("Document does not contain key-values.")
elif include_kv:
for kv in self.key_values:
keys.append(" ".join([w.text for w in kv.key]))
values.append(kv.value.get_text())

if include_checkboxes and not self.checkboxes:
logging.warning("Document does not contain checkbox elements.")
logger.warning("Document does not contain checkbox elements.")
elif include_checkboxes:
for kv in self.checkboxes:
keys.append(" ".join([w.text for w in kv.key]))
Expand All @@ -604,7 +605,7 @@ def export_kv_to_csv(
for k, v in zip(keys, values):
f.write(f"{k}{sep}{v}{os.linesep}")

logging.info(
logger.info(
f"csv file stored at location {os.path.join(os.getcwd(),filepath)}"
)

Expand All @@ -627,7 +628,7 @@ def export_kv_to_txt(
export_str = ""
index = 1
if include_kv and not self.key_values:
logging.warning("Document does not contain key-values.")
logger.warning("Document does not contain key-values.")
elif include_kv:
for kv in self.key_values:
export_str += (
Expand All @@ -636,15 +637,15 @@ def export_kv_to_txt(
index += 1

if include_checkboxes and not self.checkboxes:
logging.warning("Document does not contain checkbox elements.")
logger.warning("Document does not contain checkbox elements.")
elif include_checkboxes:
for kv in self.checkboxes:
export_str += f"{index}. {kv.key.__repr__()} : {kv.value.children[0].status.name}{os.linesep}"
index += 1

with open(filepath, "w") as text_file:
text_file.write(export_str)
logging.info(
logger.info(
f"txt file stored at location {os.path.join(os.getcwd(),filepath)}"
)

Expand All @@ -657,7 +658,7 @@ def export_tables_to_excel(self, filepath):
:type filepath: str, required
"""
if not filepath:
logging.error("Filepath required to store excel file.")
logger.error("Filepath required to store excel file.")
workbook = xlsxwriter.Workbook(filepath)
for table in self.tables:
workbook = table.to_excel(
Expand All @@ -671,7 +672,7 @@ def independent_words(self):
:rtype: EntityList[Word]
"""
if not self.words:
logging.warning("Words have not been assigned to this Document object.")
logger.warning("Words have not been assigned to this Document object.")
return []

else:
Expand Down Expand Up @@ -824,7 +825,7 @@ def _get_coords(self, word_1, word_2, direction, page):
)

if not word_1_objects:
logging.warning(f"{word_1} not found in page {page}")
logger.warning(f"{word_1} not found in page {page}")
return -1, -1, -1, -1
else:
word_1_obj = word_1_objects[0]
Expand All @@ -839,7 +840,7 @@ def _get_coords(self, word_1, word_2, direction, page):
)
word_2_objects = [word for word in word_2_objects if word.page == page]
if not word_2_objects:
logging.warning(f"{word_2} not found in page {page}")
logger.warning(f"{word_2} not found in page {page}")
return -1, -1, -1, -1
else:
word_2_obj = word_2_objects[0]
Expand Down
3 changes: 2 additions & 1 deletion textractor/entities/expense_field.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from textractor.data.constants import AnalyzeExpenseLineItemFields as AELineItems
from typing import List, Tuple

logger = logging.getLogger(__name__)

@dataclasses.dataclass
class ExpenseType:
Expand Down Expand Up @@ -257,7 +258,7 @@ def to_pandas(self, include_EXPENSE_ROW=False):
try:
from pandas import DataFrame
except ImportError:
logging.info(
logger.info(
"pandas library is required for exporting tables to DataFrame objects"
)
return None
Expand Down
5 changes: 0 additions & 5 deletions textractor/entities/identity_document.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
"""The IdentityDocument class is the object representation of an AnalyzeID response. It is similar to a dictionary. Despite its name it does not inherit from Document as the AnalyzeID response does not contains position information."""

import os
import string
import logging
import xlsxwriter
from typing import List, Dict, Union
from copy import deepcopy
from collections import defaultdict
from textractor.data.constants import AnalyzeIDFields
from textractor.entities.bbox import SpatialObject
from textractor.entities.identity_field import IdentityField
Expand Down
11 changes: 6 additions & 5 deletions textractor/entities/key_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from textractor.visualizers.entitylist import EntityList
from textractor.utils.html_utils import add_id_to_html_tag

logger = logging.getLogger(__name__)

class KeyValue(DocumentEntity):
"""
Expand Down Expand Up @@ -101,7 +102,7 @@ def key(self):
:rtype: EntityList[Word]
"""
if not self._words:
logging.info("Key contains no words objects.")
logger.info("Key contains no words objects.")
return []
return self._words

Expand All @@ -123,7 +124,7 @@ def value(self) -> Value:
:rtype: Value
"""
if self._value is None:
logging.warning(
logger.warning(
"Asked for a value but it was never attributed "
"-> make sure to assign value to key with the `kv.value = <Value Object>` property setter"
)
Expand Down Expand Up @@ -193,7 +194,7 @@ def get_words_by_type(self, text_type: str = TextTypes.PRINTED) -> List[Word]:
)

if not self.words:
logging.info("Document contains no word entities.")
logger.info("Document contains no word entities.")
return []
else:
return EntityList(
Expand All @@ -211,12 +212,12 @@ def is_selected(self) -> bool:
if len(self.value.children) == 1:
return self.value.children[0].is_selected()
else:
logging.info(
logger.info(
"is_checked() was called on a KeyValue that contains more than one checkbox. Returning first checkbox"
)
return self.value.children[0].is_selected()
else:
logging.info(
logger.info(
"is_checked() was called on a KeyValue that does not contain checkboxes. Returning False"
)
return False
Expand Down
29 changes: 15 additions & 14 deletions textractor/entities/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
from textractor.visualizers.entitylist import EntityList
from textractor.entities.linearizable import Linearizable

logger = logging.getLogger(__name__)

class Page(SpatialObject, Linearizable):
"""
Expand Down Expand Up @@ -435,7 +436,7 @@ def filter_checkboxes(
:rtype: EntityList[KeyValue]
"""
if not self.checkboxes:
logging.warning(f"This document does not contain checkboxes")
logger.warning(f"This document does not contain checkboxes")
return []
else:
if selected and not_selected:
Expand Down Expand Up @@ -476,7 +477,7 @@ def get_words_by_type(
)

if not self.words:
logging.warn("Document contains no word entities.")
logger.warning("Document contains no word entities.")
return []

filtered_words = [word for word in self.words if word.text_type == text_type]
Expand Down Expand Up @@ -750,11 +751,11 @@ def get(
lowest_similarity = top_n[-1][1]

if not top_n:
logging.warning(
logger.warning(
f"Query key does not match any existing keys in the document.{os.linesep}{self.keys()}"
)

logging.info(f"Query key matched {len(top_n)} key-values in the document.")
logger.info(f"Query key matched {len(top_n)} key-values in the document.")

return EntityList([value[0] for value in top_n])

Expand All @@ -778,14 +779,14 @@ def export_kv_to_csv(
keys = []
values = []
if include_kv and not self.key_values:
logging.warning("Document does not contain key-values.")
logger.warning("Document does not contain key-values.")
elif include_kv:
for kv in self.key_values:
keys.append(kv.key.__repr__())
values.append(kv.value.__repr__())

if include_checkboxes and not self.checkboxes:
logging.warning("Document does not contain checkbox elements.")
logger.warning("Document does not contain checkbox elements.")
elif include_checkboxes:
for kv in self.checkboxes:
keys.append(kv.key.__repr__())
Expand All @@ -796,7 +797,7 @@ def export_kv_to_csv(
for k, v in zip(keys, values):
f.write(f"{k},{v}{os.linesep}")

logging.info(
logger.info(
f"csv file stored at location {os.path.join(os.getcwd(), filepath)}"
)

Expand All @@ -819,7 +820,7 @@ def export_kv_to_txt(
export_str = []
index = 1
if include_kv and not self.key_values:
logging.warning("Document does not contain key-values.")
logger.warning("Document does not contain key-values.")
elif include_kv:
for kv in self.key_values:
export_str.append(
Expand All @@ -828,7 +829,7 @@ def export_kv_to_txt(
index += 1

if include_checkboxes and not self.checkboxes:
logging.warning("Document does not contain checkbox elements.")
logger.warning("Document does not contain checkbox elements.")
elif include_checkboxes:
for kv in self.checkboxes:
export_str.append(
Expand All @@ -838,7 +839,7 @@ def export_kv_to_txt(

with open(filepath, "w") as text_file:
text_file.write("".join(export_str))
logging.info(
logger.info(
f"txt file stored at location {os.path.join(os.getcwd(),filepath)}"
)

Expand All @@ -848,7 +849,7 @@ def independent_words(self) -> EntityList[Word]:
:rtype: EntityList[Word]
"""
if not self.words:
logging.warning("Words have not been assigned to this Document object.")
logger.warning("Words have not been assigned to this Document object.")
return []

else:
Expand All @@ -871,7 +872,7 @@ def export_tables_to_excel(self, filepath):
:type filepath: str, required
"""
if not filepath:
logging.error("Filepath required to store excel file.")
logger.error("Filepath required to store excel file.")
workbook = xlsxwriter.Workbook(filepath)
for table in self.tables:
workbook = table.to_excel(
Expand Down Expand Up @@ -1041,7 +1042,7 @@ def _get_coords(self, word_1, word_2, direction):
)

if not word_1_objects:
logging.warning(f"{word_1} not found in page")
logger.warning(f"{word_1} not found in page")
return -1, -1, -1, -1
else:
word_1_obj = word_1_objects[0]
Expand All @@ -1056,7 +1057,7 @@ def _get_coords(self, word_1, word_2, direction):
)

if not word_2_objects:
logging.warning(f"{word_2} not found in page")
logger.warning(f"{word_2} not found in page")
return -1, -1, -1, -1
else:
word_2_obj = word_2_objects[0]
Expand Down
2 changes: 0 additions & 2 deletions textractor/entities/signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
bounding box information, page number, Page ID and confidence of detection.
"""

import logging
from typing import List
import uuid
from textractor.data.text_linearization_config import TextLinearizationConfig

Expand Down
Loading