-
Notifications
You must be signed in to change notification settings - Fork 17
Fixed s3 decoding #1516
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: antalya-26.1
Are you sure you want to change the base?
Fixed s3 decoding #1516
Changes from 5 commits
2908d6d
9bcbbe6
312aaa5
7d57785
e8d4d20
140199a
357fbb3
5caa8cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -514,9 +514,20 @@ std::pair<DB::ObjectStoragePtr, std::string> resolveObjectStorageForPath( | |
| normalized_path = "gs://" + target_decomposed.authority + "/" + target_decomposed.key; | ||
| } | ||
| S3::URI s3_uri(normalized_path); | ||
|
|
||
| std::string key_to_use = s3_uri.key; | ||
|
|
||
|
|
||
| // Use key (parsed without URI decoding) so that percent-encoded | ||
| // characters in object keys (e.g. %2F in Iceberg partition paths) are preserved. | ||
| std::string key_to_use = target_decomposed.key; | ||
|
|
||
| // For path-style HTTP(S) URLs, SchemeAuthorityKey puts <bucket>/<key> in .key | ||
| // because the bucket lives in the URL path, not the hostname. | ||
| // Strip the bucket prefix so the key is relative to the bucket. | ||
| if (!s3_uri.is_virtual_hosted_style | ||
| && key_to_use.starts_with(s3_uri.bucket + "/")) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this work correctly when path really starts with bucket name? Like
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For this |
||
| { | ||
| key_to_use = key_to_use.substr(s3_uri.bucket.size() + 1); | ||
| } | ||
|
|
||
| bool use_base_storage = false; | ||
| if (base_storage->getType() == ObjectStorageType::S3) | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,122 @@ | ||
| #include <gtest/gtest.h> | ||
|
|
||
| #include <base/defines.h> | ||
| #include <IO/S3/Credentials.h> | ||
| #include "config.h" | ||
|
|
||
| #if USE_AWS_S3 && USE_AVRO | ||
|
|
||
| #include <Disks/DiskObjectStorage/ObjectStorages/S3/S3ObjectStorage.h> | ||
| #include <IO/S3/Client.h> | ||
| #include <IO/S3/PocoHTTPClient.h> | ||
| #include <IO/S3/URI.h> | ||
| #include <Storages/ObjectStorage/Utils.h> | ||
|
|
||
| using namespace DB; | ||
|
|
||
| namespace | ||
| { | ||
| struct ClientFake : DB::S3::Client | ||
| { | ||
| explicit ClientFake() | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar to gtest_readbuffer_s3.cpp |
||
| : DB::S3::Client( | ||
| 1, | ||
| DB::S3::ServerSideEncryptionKMSConfig(), | ||
| std::make_shared<Aws::Auth::SimpleAWSCredentialsProvider>("test_access_key", "test_secret"), | ||
| DB::S3::ClientFactory::instance().createClientConfiguration( | ||
| "test_region", | ||
| DB::RemoteHostFilter(), | ||
| 1, | ||
| DB::S3::PocoHTTPClientConfiguration::RetryStrategy{.max_retries = 0}, | ||
| true, | ||
| true, | ||
| true, | ||
| false, | ||
| {}, | ||
| /* request_throttler = */ {}, | ||
| "http"), | ||
| Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, | ||
| DB::S3::ClientSettings()) | ||
| { | ||
| } | ||
|
|
||
| Aws::S3::Model::GetObjectOutcome GetObject([[maybe_unused]] const Aws::S3::Model::GetObjectRequest &) const override | ||
| { | ||
| UNREACHABLE(); | ||
| } | ||
| }; | ||
|
|
||
| std::shared_ptr<S3ObjectStorage> makeBaseStorage(S3::URI uri) | ||
| { | ||
| S3Capabilities cap; | ||
| ObjectStorageKeyGeneratorPtr gen; | ||
| const String disk_name = "s3"; | ||
| return std::make_shared<S3ObjectStorage>( | ||
| std::make_unique<ClientFake>(), std::make_unique<S3Settings>(), std::move(uri), cap, gen, disk_name); | ||
| } | ||
|
|
||
| void assertResolveReturnsBaseKey(const std::string & table_location, const std::string & path, S3::URI base_uri, const std::string & expected_key) | ||
| { | ||
| auto base = makeBaseStorage(std::move(base_uri)); | ||
| SecondaryStorages secondary_storages; | ||
| auto [storage, key] = resolveObjectStorageForPath(table_location, path, base, secondary_storages, nullptr); | ||
| ASSERT_EQ(storage.get(), base.get()); | ||
| ASSERT_EQ(key, expected_key); | ||
| } | ||
| } | ||
|
|
||
| TEST(ResolveObjectStorageForPathKey, S3SchemeKeyNoBucketInPath) | ||
| { | ||
| const char * path = "s3://my-bucket/dir/file.parquet"; | ||
| S3::URI target(path); | ||
| S3::URI base_uri; | ||
| base_uri.bucket = target.bucket; | ||
| base_uri.endpoint = target.endpoint; | ||
| assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet"); | ||
| } | ||
|
|
||
| TEST(ResolveObjectStorageForPathKey, VirtualHostedHttpsKeyNoBucket) | ||
| { | ||
| const char * path = "https://my-bucket.s3.amazonaws.com/dir/file.parquet"; | ||
| S3::URI target(path); | ||
| ASSERT_TRUE(target.is_virtual_hosted_style); | ||
| S3::URI base_uri; | ||
| base_uri.bucket = target.bucket; | ||
| base_uri.endpoint = target.endpoint; | ||
| assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet"); | ||
| } | ||
|
|
||
| TEST(ResolveObjectStorageForPathKey, PathStyleHttpsBucketPrefixStripped) | ||
| { | ||
| const char * path = "https://s3.amazonaws.com/my-bucket/dir/file.parquet"; | ||
| S3::URI target(path); | ||
| ASSERT_FALSE(target.is_virtual_hosted_style); | ||
| ASSERT_EQ(target.bucket, "my-bucket"); | ||
| S3::URI base_uri; | ||
| base_uri.bucket = target.bucket; | ||
| base_uri.endpoint = target.endpoint; | ||
| assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet"); | ||
| } | ||
|
|
||
| TEST(ResolveObjectStorageForPathKey, PathStyleMinioPreservesPercentEncodedSlash) | ||
| { | ||
| const char * path = "http://minio:9000/bucket/partition=us%2Fwest/data.parquet"; | ||
| S3::URI target(path); | ||
| ASSERT_FALSE(target.is_virtual_hosted_style); | ||
| S3::URI base_uri; | ||
| base_uri.bucket = target.bucket; | ||
| base_uri.endpoint = target.endpoint; | ||
| assertResolveReturnsBaseKey("http://minio:9000/bucket/warehouse", path, std::move(base_uri), "partition=us%2Fwest/data.parquet"); | ||
| } | ||
|
|
||
| TEST(ResolveObjectStorageForPathKey, S3SchemePreservesPercentEncodedSlash) | ||
| { | ||
| const char * path = "s3://bucket/partition=us%2Fwest/file.parquet"; | ||
| S3::URI target(path); | ||
| S3::URI base_uri; | ||
| base_uri.bucket = target.bucket; | ||
| base_uri.endpoint = target.endpoint; | ||
| assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "partition=us%2Fwest/file.parquet"); | ||
| } | ||
|
|
||
| #endif | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Deriving
key_to_usefromtarget_decomposed.keyregresses path-style S3 locations likehttps://s3.amazonaws.com/<bucket>/<key>(and MinIO-stylehttp(s)://endpoint/<bucket>/<key>):SchemeAuthorityKeyleaves the bucket in the key, while bucket selection is already taken froms3_uri.bucket. That makes reads use a bucket-prefixed key against a storage already scoped to that bucket, so objects are looked up under the wrong path and can return 404 for valid files.Useful? React with 👍 / 👎.