Skip to content

Commit ad1933a

Browse files
authored
Fix s3 client instantiation in _get_document_images_from_path
2 parents bb54bc6 + 55fc21b commit ad1933a

File tree

2 files changed: 9 additions (+9) and 9 deletions (-9).

textractor/entities/lazy_document.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ def __init__(
2727
job_id: str,
2828
api: TextractAPI,
2929
textract_client=None,
30+
s3_client=None,
3031
images=None,
3132
output_config: OutputConfig = None,
3233
):
@@ -38,6 +39,7 @@ def __init__(
3839
self.job_id = job_id
3940
self._api = api
4041
self._textract_client = textract_client
42+
self._s3_client = s3_client
4143
self._document = None
4244
self._images = images
4345
self._output_config = output_config
@@ -104,6 +106,7 @@ def __getattr__(self, __name: str) -> Any:
104106
"job_id",
105107
"_api",
106108
"_textract_client",
109+
"_s3_client",
107110
"_document",
108111
"_images",
109112
"s3_polling_interval",
@@ -116,14 +119,13 @@ def __getattr__(self, __name: str) -> Any:
116119

117120
if self._document is None:
118121
if self._output_config:
119-
s3_client = boto3.client("s3")
120122
start = time.time()
121123
response = None
122124
while not results_exist(
123125
self.job_id,
124126
self._output_config.s3_bucket,
125127
self._output_config.s3_prefix,
126-
s3_client,
128+
self._s3_client,
127129
):
128130
time.sleep(self._s3_polling_interval)
129131
if time.time() - start > self._textract_polling_interval:
@@ -158,7 +160,7 @@ def __getattr__(self, __name: str) -> Any:
158160
response = get_full_json_from_output_config(
159161
self._output_config,
160162
self.job_id,
161-
s3_client,
163+
self._s3_client,
162164
)
163165
else:
164166
if not self._textract_client:

textractor/textractor.py

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -127,12 +127,7 @@ def _get_document_images_from_path(self, filepath: str) -> List[Image.Image]:
127127
bucket = edit_filepath.split("/")[0]
128128
key = edit_filepath[edit_filepath.index("/") + 1 :]
129129

130-
s3_client = (
131-
boto3.session.Session(profile_name=self.profile_name).client("s3")
132-
if self.profile_name is not None
133-
else boto3.session.Session(region_name=self.region_name).client("s3")
134-
)
135-
file_obj = s3_client.get_object(Bucket=bucket, Key=key).get("Body").read()
130+
file_obj = self.s3_client.get_object(Bucket=bucket, Key=key).get("Body").read()
136131
if filepath.lower().endswith(".pdf"):
137132
if IS_PDF_RENDERING_ENABLED:
138133
images = rasterize_pdf(file_obj)
@@ -339,6 +334,7 @@ def start_document_text_detection(
339334
response["JobId"],
340335
TextractAPI.DETECT_TEXT,
341336
textract_client=self.textract_client,
337+
s3_client=self.s3_client,
342338
images=images,
343339
)
344340

@@ -582,6 +578,7 @@ def start_document_analysis(
582578
response["JobId"],
583579
TextractAPI.ANALYZE,
584580
textract_client=self.textract_client,
581+
s3_client=self.s3_client,
585582
images=images,
586583
output_config=output_config,
587584
)
@@ -812,6 +809,7 @@ def start_expense_analysis(
812809
response["JobId"],
813810
TextractAPI.EXPENSE,
814811
textract_client=self.textract_client,
812+
s3_client=self.s3_client,
815813
images=images,
816814
)
817815

0 commit comments

Comments
 (0)