From 17d7dc444cfc94acd0d9e0084e031775cd04ccb7 Mon Sep 17 00:00:00 2001 From: elena-kalinina Date: Wed, 15 Oct 2025 11:47:39 +0200 Subject: [PATCH] fix: check for presigned url In my previous commit, I fixed the regex, which did not capture the S3 bucket URL structure and failed to distinguish presigned URLs. However, I realized that fixing the regex alone is not enough, because the check now does not distinguish between public and presigned S3 URLs. So I introduced an improved check that matches only presigned URLs. --- .../document_loaders/pdf.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 6b51e481e..99cd3ef00 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -164,8 +164,23 @@ def _is_s3_url(url: str) -> bool: def _is_s3_presigned_url(url: str) -> bool: """Check if the url is a presigned S3 url.""" try: - result = urlparse(url) - return bool(re.search(r"\.s3\.amazonaws\.com$", result.netloc)) + # Parse the URL into its components + parsed_url = urlparse(url) + + # Check if the domain (netloc) matches the S3 pattern. + s3_domain_pattern = r"(?:^|.*\.)s3(?:\.[\w-]+)?\.amazonaws\.com$" + is_s3_domain = bool(re.search(s3_domain_pattern, parsed_url.netloc)) + + if not is_s3_domain: + return False + + # Parse the query string into a dictionary + query_params = parse_qs(parsed_url.query) + + # Check if the signature key exists in the query parameters. + # This is the definitive test for a presigned URL. + return 'X-Amz-Signature' in query_params + except ValueError: return False