Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions projects/pgai/pgai/vectorizer/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
from io import BytesIO
from typing import Any, Literal

import structlog
from filetype import filetype # type: ignore
from pydantic import BaseModel

logger = structlog.get_logger()


@dataclass
class LoadedDocument:
Expand Down Expand Up @@ -56,6 +59,7 @@ def load(self, row: dict[str, str]) -> LoadedDocument:
file_path = row[self.column_name]

transport_params = None
s3_resource = None
if file_path.startswith("s3://") and self.aws_role_arn is not None:
external_id = os.getenv("AWS_ASSUME_ROLE_EXTERNAL_ID")
sts_client: STSClient = boto3.client("sts") # type: ignore
Expand All @@ -80,11 +84,17 @@ def load(self, row: dict[str, str]) -> LoadedDocument:
# Create an S3 client using the session with assumed role
s3_client: S3Client = session.client("s3") # type: ignore
transport_params = {"client": s3_client}
content = BytesIO(
smart_open.open( # type: ignore
file_path, "rb", transport_params=transport_params
).read()
)
s3_resource = session.resource("s3") # type: ignore
if file_path.startswith("s3://") and not self.aws_role_arn:
import boto3

s3_resource = boto3.resource("s3") # type: ignore
Comment on lines +88 to +91
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the purpose of this change?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want to log the file size before actually downloading the file, you need a way to access the s3_resource directly.

This was just meant as a debugging thing to log file sizes to cloudwatch before the lambda crashed due to OOM. It's very ugly, and I don't like it, but it's the way I found.

We can just close this PR.

file = smart_open.open(file_path, "rb", transport_params=transport_params) # type: ignore
if s3_resource is not None:
size = file.to_boto3(s3_resource).content_length # type: ignore
logger.info(f"Preparing to download file {file_path}, size: {size} bytes")

content = BytesIO(file.read()) # type: ignore
return LoadedDocument(
content=content,
file_path=file_path,
Expand Down
Loading