Skip to content

Commit 311d786

Browse files
committed
Added fix for path style URI parsing
1 parent 7d57785 commit 311d786

File tree

2 files changed

+131
-0
lines changed

2 files changed

+131
-0
lines changed

src/Storages/ObjectStorage/Utils.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,15 @@ std::pair<DB::ObjectStoragePtr, std::string> resolveObjectStorageForPath(
518518
// Use key (parsed without URI decoding) so that percent-encoded
519519
// characters in object keys (e.g. %2F in Iceberg partition paths) are preserved.
520520
std::string key_to_use = target_decomposed.key;
521+
522+
// For path-style HTTP(S) URLs, SchemeAuthorityKey puts <bucket>/<key> in .key
523+
// because the bucket lives in the URL path, not the hostname.
524+
// Strip the bucket prefix so the key is relative to the bucket.
525+
if (!s3_uri.is_virtual_hosted_style
526+
&& key_to_use.starts_with(s3_uri.bucket + "/"))
527+
{
528+
key_to_use = key_to_use.substr(s3_uri.bucket.size() + 1);
529+
}
521530

522531
bool use_base_storage = false;
523532
if (base_storage->getType() == ObjectStorageType::S3)
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#include <gtest/gtest.h>
2+
3+
#include <base/defines.h>
4+
#include <IO/S3/Credentials.h>
5+
#include "config.h"
6+
7+
#if USE_AWS_S3 && USE_AVRO
8+
9+
#include <Disks/DiskObjectStorage/ObjectStorages/S3/S3ObjectStorage.h>
10+
#include <IO/S3/Client.h>
11+
#include <IO/S3/PocoHTTPClient.h>
12+
#include <IO/S3/URI.h>
13+
#include <Storages/ObjectStorage/Utils.h>
14+
15+
using namespace DB;
16+
17+
namespace
18+
{
19+
struct ClientFake : DB::S3::Client
20+
{
21+
explicit ClientFake()
22+
: DB::S3::Client(
23+
1,
24+
DB::S3::ServerSideEncryptionKMSConfig(),
25+
std::make_shared<Aws::Auth::SimpleAWSCredentialsProvider>("test_access_key", "test_secret"),
26+
DB::S3::ClientFactory::instance().createClientConfiguration(
27+
"test_region",
28+
DB::RemoteHostFilter(),
29+
1,
30+
DB::S3::PocoHTTPClientConfiguration::RetryStrategy{.max_retries = 0},
31+
true,
32+
true,
33+
true,
34+
false,
35+
{},
36+
/* request_throttler = */ {},
37+
"http"),
38+
Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never,
39+
DB::S3::ClientSettings())
40+
{
41+
}
42+
43+
Aws::S3::Model::GetObjectOutcome GetObject([[maybe_unused]] const Aws::S3::Model::GetObjectRequest &) const override
44+
{
45+
UNREACHABLE();
46+
}
47+
};
48+
49+
std::shared_ptr<S3ObjectStorage> makeBaseStorage(S3::URI uri)
50+
{
51+
S3Capabilities cap;
52+
ObjectStorageKeyGeneratorPtr gen;
53+
const String disk_name = "s3";
54+
return std::make_shared<S3ObjectStorage>(
55+
std::make_unique<ClientFake>(), std::make_unique<S3Settings>(), std::move(uri), cap, gen, disk_name);
56+
}
57+
58+
void assertResolveReturnsBaseKey(const std::string & table_location, const std::string & path, S3::URI base_uri, const std::string & expected_key)
59+
{
60+
auto base = makeBaseStorage(std::move(base_uri));
61+
SecondaryStorages secondary_storages;
62+
auto [storage, key] = resolveObjectStorageForPath(table_location, path, base, secondary_storages, nullptr);
63+
ASSERT_EQ(storage.get(), base.get());
64+
ASSERT_EQ(key, expected_key);
65+
}
66+
}
67+
68+
TEST(ResolveObjectStorageForPathKey, S3SchemeKeyNoBucketInPath)
69+
{
70+
const char * path = "s3://my-bucket/dir/file.parquet";
71+
S3::URI target(path);
72+
S3::URI base_uri;
73+
base_uri.bucket = target.bucket;
74+
base_uri.endpoint = target.endpoint;
75+
assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet");
76+
}
77+
78+
TEST(ResolveObjectStorageForPathKey, VirtualHostedHttpsKeyNoBucket)
79+
{
80+
const char * path = "https://my-bucket.s3.amazonaws.com/dir/file.parquet";
81+
S3::URI target(path);
82+
ASSERT_TRUE(target.is_virtual_hosted_style);
83+
S3::URI base_uri;
84+
base_uri.bucket = target.bucket;
85+
base_uri.endpoint = target.endpoint;
86+
assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet");
87+
}
88+
89+
TEST(ResolveObjectStorageForPathKey, PathStyleHttpsBucketPrefixStripped)
90+
{
91+
const char * path = "https://s3.amazonaws.com/my-bucket/dir/file.parquet";
92+
S3::URI target(path);
93+
ASSERT_FALSE(target.is_virtual_hosted_style);
94+
ASSERT_EQ(target.bucket, "my-bucket");
95+
S3::URI base_uri;
96+
base_uri.bucket = target.bucket;
97+
base_uri.endpoint = target.endpoint;
98+
assertResolveReturnsBaseKey("s3://my-bucket/warehouse", path, std::move(base_uri), "dir/file.parquet");
99+
}
100+
101+
TEST(ResolveObjectStorageForPathKey, PathStyleMinioPreservesPercentEncodedSlash)
102+
{
103+
const char * path = "http://minio:9000/bucket/partition=us%2Fwest/data.parquet";
104+
S3::URI target(path);
105+
ASSERT_FALSE(target.is_virtual_hosted_style);
106+
S3::URI base_uri;
107+
base_uri.bucket = target.bucket;
108+
base_uri.endpoint = target.endpoint;
109+
assertResolveReturnsBaseKey("http://minio:9000/bucket/warehouse", path, std::move(base_uri), "partition=us%2Fwest/data.parquet");
110+
}
111+
112+
TEST(ResolveObjectStorageForPathKey, S3SchemePreservesPercentEncodedSlash)
113+
{
114+
const char * path = "s3://bucket/partition=us%2Fwest/file.parquet";
115+
S3::URI target(path);
116+
S3::URI base_uri;
117+
base_uri.bucket = target.bucket;
118+
base_uri.endpoint = target.endpoint;
119+
assertResolveReturnsBaseKey("s3://bucket/warehouse", path, std::move(base_uri), "partition=us%2Fwest/file.parquet");
120+
}
121+
122+
#endif

0 commit comments

Comments
 (0)