Commit 59daa3f

Merge branch 'main' into instantiatesecretsparallel
2 parents: 753a55f + 7d19b89

8 files changed: +235, -24 lines

.github/workflows/MainDistributionPipeline.yml

Lines changed: 3 additions & 3 deletions

@@ -17,7 +17,7 @@ jobs:
     uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
     with:
       extension_name: httpfs
-      duckdb_version: 9c2a64a
+      duckdb_version: cc7c620e58625f6b5f53f724b6a9645f4ee0be6f
       ci_tools_version: main

   duckdb-stable-deploy:
@@ -27,7 +27,7 @@ jobs:
     secrets: inherit
     with:
       extension_name: httpfs
-      duckdb_version: 9c2a64a
+      duckdb_version: cc7c620e58625f6b5f53f724b6a9645f4ee0be6f
       ci_tools_version: main
       deploy_latest: ${{ startsWith(github.ref, 'refs/heads/v') }}
       deploy_versioned: ${{ startsWith(github.ref, 'refs/heads/v') || github.ref == 'refs/heads/main' }}
@@ -37,7 +37,7 @@ jobs:
     uses: duckdb/extension-ci-tools/.github/workflows/_extension_code_quality.yml@main
     with:
       extension_name: httpfs
-      duckdb_version: 1f9ecad746
+      duckdb_version: cc7c620e58625f6b5f53f724b6a9645f4ee0be6f
       ci_tools_version: main
       extra_toolchains: 'python3'
       format_checks: 'format'

duckdb

Submodule duckdb updated 245 files

src/httpfs_curl_client.cpp

Lines changed: 28 additions & 5 deletions

@@ -39,8 +39,6 @@ static std::string SelectCURLCertPath() {
     return std::string();
 }

-static std::string cert_path = SelectCURLCertPath();
-
 static size_t RequestWriteCallback(void *contents, size_t size, size_t nmemb, void *userp) {
     size_t totalSize = size * nmemb;
     std::string *str = static_cast<std::string *>(userp);
@@ -97,6 +95,7 @@ CURLHandle::CURLHandle(const string &token, const string &cert_path) {
     if (!cert_path.empty()) {
         curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str());
     }
+    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_AUTO_CLIENT_CERT | CURLSSLOPT_NATIVE_CA);
 }

 CURLHandle::~CURLHandle() {
@@ -117,6 +116,8 @@ class HTTPFSCurlClient : public HTTPClient {
     HTTPFSCurlClient(HTTPFSParams &http_params, const string &proto_host_port) {
         base_url = curl_url();
         curl_url_set(base_url, CURLUPART_URL, proto_host_port.c_str(), 0);
+        stored_bearer_token = "";
+        stored_cert_file_path = "";
         Initialize(http_params);
     }
     void Initialize(HTTPParams &http_p) override {
@@ -125,12 +126,25 @@ class HTTPFSCurlClient : public HTTPClient {
         if (!http_params.bearer_token.empty()) {
             bearer_token = http_params.bearer_token.c_str();
         }
+
         state = http_params.state;

-        // call curl_global_init if not already done by another HTTPFS Client
-        InitCurlGlobal();
+        std::string cert_file_path;
+        if (!http_params.ca_cert_file.empty()) {
+            cert_file_path = http_params.ca_cert_file;
+        }
+
+        if (!curl || (stored_bearer_token != bearer_token) || (stored_cert_file_path != cert_file_path)) {
+            // call curl_global_init if not already done by another HTTPFS Client
+            InitCurlGlobal();
+
+            stored_cert_file_path = cert_file_path;

-        curl = make_uniq<CURLHandle>(bearer_token, SelectCURLCertPath());
+            if (cert_file_path.empty()) {
+                cert_file_path = SelectCURLCertPath();
+            }
+            curl = make_uniq<CURLHandle>(bearer_token, cert_file_path);
+        }
         request_info = make_uniq<RequestInfo>();

         // set curl options
@@ -140,6 +154,8 @@ class HTTPFSCurlClient : public HTTPClient {
         // Curl re-uses connections by default
         if (!http_params.keep_alive) {
             curl_easy_setopt(*curl, CURLOPT_FORBID_REUSE, 1L);
+        } else {
+            curl_easy_setopt(*curl, CURLOPT_FORBID_REUSE, 0L);
         }

         if (http_params.enable_curl_server_cert_verification) {
@@ -167,6 +183,11 @@ class HTTPFSCurlClient : public HTTPClient {
         curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback);
         curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body);

+        // Reset PROXY-related settings, so they are set only on proxy actually being there
+        curl_easy_setopt(*curl, CURLOPT_PROXY, NULL);
+        curl_easy_setopt(*curl, CURLOPT_PROXYUSERNAME, NULL);
+        curl_easy_setopt(*curl, CURLOPT_PROXYPASSWORD, NULL);
+
         if (!http_params.http_proxy.empty()) {
             curl_easy_setopt(*curl, CURLOPT_PROXY,
                              StringUtil::Format("%s:%d", http_params.http_proxy, http_params.http_proxy_port).c_str());
@@ -452,6 +473,8 @@ class HTTPFSCurlClient : public HTTPClient {
     optional_ptr<HTTPState> state;
     unique_ptr<RequestInfo> request_info;
     CURLU *base_url = nullptr;
+    string stored_bearer_token;
+    string stored_cert_file_path;

     static std::mutex &GetRefLock() {
         static std::mutex mtx;
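
The Initialize rework above is the core of this change: the CURL easy handle is now kept across Initialize calls and only rebuilt when the bearer token or CA certificate path differs from what it was built with, so keep-alive connections can actually be reused. A minimal standalone sketch of that caching pattern, assuming only libcurl (7.77+ for CURLSSLOPT_AUTO_CLIENT_CERT); CachedClient and its members are illustrative names, not the extension's API:

// Sketch of the handle-caching pattern: rebuild the easy handle only when
// the token or cert path changed. CachedClient is a hypothetical stand-in.
#include <curl/curl.h>
#include <memory>
#include <string>

struct CachedClient {
    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl{nullptr, curl_easy_cleanup};
    std::string stored_bearer_token;
    std::string stored_cert_file_path;

    void Initialize(const std::string &bearer_token, const std::string &cert_file_path) {
        // Reuse the existing handle (and its live connections) when nothing changed.
        if (curl && stored_bearer_token == bearer_token && stored_cert_file_path == cert_file_path) {
            return;
        }
        curl.reset(curl_easy_init());
        if (!cert_file_path.empty()) {
            curl_easy_setopt(curl.get(), CURLOPT_CAINFO, cert_file_path.c_str());
        }
        // Same TLS options the diff sets on every freshly created handle.
        curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_AUTO_CLIENT_CERT | CURLSSLOPT_NATIVE_CA);
        stored_bearer_token = bearer_token;
        stored_cert_file_path = cert_file_path;
    }
};

Note that the diff also explicitly sets CURLOPT_FORBID_REUSE to 0 when keep_alive is enabled: with a cached handle, connection reuse is what makes the caching pay off.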

src/include/s3fs.hpp

Lines changed: 1 addition & 0 deletions

@@ -223,6 +223,7 @@ class S3FileSystem : public HTTPFileSystem {
         return false;
     }
     void RemoveFile(const string &filename, optional_ptr<FileOpener> opener = nullptr) override;
+    void RemoveFiles(const vector<string> &filenames, optional_ptr<FileOpener> opener = nullptr) override;
     void RemoveDirectory(const string &directory, optional_ptr<FileOpener> opener = nullptr) override;
     void FileSync(FileHandle &handle) override;
     void Write(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) override;

src/s3fs.cpp

Lines changed: 109 additions & 14 deletions

@@ -12,6 +12,8 @@
 #include "http_state.hpp"

 #include "duckdb/common/string_util.hpp"
+#include "duckdb/common/crypto/md5.hpp"
+#include "duckdb/common/types/blob.hpp"
 #include "duckdb/function/scalar/string_common.hpp"
 #include "duckdb/main/secret/secret_manager.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
@@ -1028,21 +1030,114 @@ void S3FileSystem::RemoveFile(const string &path, optional_ptr<FileOpener> opener) {
     }
 }

+// Forward declaration for FindTagContents (defined later in file)
+optional_idx FindTagContents(const string &response, const string &tag, idx_t cur_pos, string &result);
+
+void S3FileSystem::RemoveFiles(const vector<string> &paths, optional_ptr<FileOpener> opener) {
+    if (paths.empty()) {
+        return;
+    }
+
+    struct BucketUrlInfo {
+        string prefix;
+        string http_proto;
+        string host;
+        string path;
+        S3AuthParams auth_params;
+    };
+
+    unordered_map<string, vector<string>> keys_by_bucket;
+    unordered_map<string, BucketUrlInfo> url_info_by_bucket;
+
+    for (auto &path : paths) {
+        FileOpenerInfo info = {path};
+        S3AuthParams auth_params = S3AuthParams::ReadFrom(opener, info);
+        auto parsed_url = S3UrlParse(path, auth_params);
+        ReadQueryParams(parsed_url.query_param, auth_params);
+
+        const string &bucket = parsed_url.bucket;
+        if (keys_by_bucket.find(bucket) == keys_by_bucket.end()) {
+            string bucket_path = parsed_url.path.substr(0, parsed_url.path.length() - parsed_url.key.length() - 1);
+            if (bucket_path.empty()) {
+                bucket_path = "/";
+            }
+            url_info_by_bucket[bucket] = {parsed_url.prefix, parsed_url.http_proto, parsed_url.host, bucket_path,
+                                          auth_params};
+        }
+
+        keys_by_bucket[bucket].push_back(parsed_url.key);
+    }
+
+    constexpr idx_t MAX_KEYS_PER_REQUEST = 1000;
+
+    for (auto &bucket_entry : keys_by_bucket) {
+        const string &bucket = bucket_entry.first;
+        const vector<string> &keys = bucket_entry.second;
+        const auto &url_info = url_info_by_bucket[bucket];
+
+        for (idx_t batch_start = 0; batch_start < keys.size(); batch_start += MAX_KEYS_PER_REQUEST) {
+            idx_t batch_end = MinValue<idx_t>(batch_start + MAX_KEYS_PER_REQUEST, keys.size());
+
+            std::stringstream xml_body;
+            xml_body << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+            xml_body << "<Delete xmlns=\"http://s3.amazonaws.com/doc/2006-03-01/\">";
+
+            for (idx_t i = batch_start; i < batch_end; i++) {
+                xml_body << "<Object><Key>" << keys[i] << "</Key></Object>";
+            }
+
+            xml_body << "<Quiet>true</Quiet>";
+            xml_body << "</Delete>";
+
+            string body = xml_body.str();
+
+            MD5Context md5_context;
+            md5_context.Add(body);
+            data_t md5_hash[MD5Context::MD5_HASH_LENGTH_BINARY];
+            md5_context.Finish(md5_hash);
+
+            string_t md5_blob(const_char_ptr_cast(md5_hash), MD5Context::MD5_HASH_LENGTH_BINARY);
+            string content_md5 = Blob::ToBase64(md5_blob);
+
+            const string http_query_param_for_sig = "delete=";
+            const string http_query_param_for_url = "delete";
+            auto payload_hash = GetPayloadHash(const_cast<char *>(body.data()), body.length());
+
+            auto headers = CreateS3Header(url_info.path, http_query_param_for_sig, url_info.host, "s3", "POST",
+                                          url_info.auth_params, "", "", payload_hash, "");
+            headers["Content-MD5"] = content_md5;
+            headers["Content-Type"] = "application/xml";
+
+            string http_url = url_info.http_proto + url_info.host + S3FileSystem::UrlEncode(url_info.path) + "?" +
+                              http_query_param_for_url;
+            string bucket_url = url_info.prefix + bucket + "/";
+            auto handle = OpenFile(bucket_url, FileFlags::FILE_FLAGS_READ, opener);
+
+            string result;
+            auto res = HTTPFileSystem::PostRequest(*handle, http_url, headers, result, const_cast<char *>(body.data()),
+                                                   body.length());

+            if (res->status != HTTPStatusCode::OK_200) {
+                throw IOException("Failed to remove files: HTTP %d (%s)\n%s", static_cast<int>(res->status),
+                                  res->GetError(), result);
+            }
+
+            idx_t cur_pos = 0;
+            string error_content;
+            auto error_pos = FindTagContents(result, "Error", cur_pos, error_content);
+            if (error_pos.IsValid()) {
+                throw IOException("Failed to remove files: %s", error_content);
+            }
+        }
+    }
+}
+
 void S3FileSystem::RemoveDirectory(const string &path, optional_ptr<FileOpener> opener) {
+    vector<string> files_to_remove;
     ListFiles(
-        path,
-        [&](const string &file, bool is_dir) {
-            try {
-                this->RemoveFile(file, opener);
-            } catch (IOException &e) {
-                string errmsg(e.what());
-                if (errmsg.find("No such file or directory") != std::string::npos) {
-                    return;
-                }
-                throw;
-            }
-        },
-        opener.get());
+        path, [&](const string &file, bool is_dir) { files_to_remove.push_back(file); }, opener.get());
+
+    RemoveFiles(files_to_remove, opener);
 }

 void S3FileSystem::FileSync(FileHandle &handle) {
@@ -1137,7 +1232,7 @@ struct S3GlobResult : public LazyMultiFileList {
 };

 S3GlobResult::S3GlobResult(S3FileSystem &fs, const string &glob_pattern_p, optional_ptr<FileOpener> opener)
-    : glob_pattern(glob_pattern_p), opener(opener) {
+    : LazyMultiFileList(FileOpener::TryGetClientContext(opener)), glob_pattern(glob_pattern_p), opener(opener) {
     if (!opener) {
         throw InternalException("Cannot S3 Glob without FileOpener");
     }
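
RemoveFiles above groups paths by bucket and then issues S3 multi-object DeleteObjects POST requests of at most 1000 keys each (the S3 API limit), with a Content-MD5 header computed over the XML body and quiet mode enabled. A self-contained sketch of just the batching and body construction, leaving out the MD5 and SigV4 signing; build_delete_bodies is a hypothetical helper for illustration, not the extension's code:

// Sketch of batching keys into S3 DeleteObjects XML bodies. The extension's
// real code also computes Content-MD5 and signed headers before POSTing each
// body to <bucket>?delete.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> build_delete_bodies(const std::vector<std::string> &keys) {
    constexpr std::size_t MAX_KEYS_PER_REQUEST = 1000; // S3 DeleteObjects limit
    std::vector<std::string> bodies;
    for (std::size_t start = 0; start < keys.size(); start += MAX_KEYS_PER_REQUEST) {
        std::size_t end = std::min(start + MAX_KEYS_PER_REQUEST, keys.size());
        std::stringstream xml;
        xml << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
        xml << "<Delete xmlns=\"http://s3.amazonaws.com/doc/2006-03-01/\">";
        for (std::size_t i = start; i < end; i++) {
            xml << "<Object><Key>" << keys[i] << "</Key></Object>";
        }
        // Quiet mode: the response then only lists keys that failed to delete.
        xml << "<Quiet>true</Quiet></Delete>";
        bodies.push_back(xml.str());
    }
    return bodies;
}

int main() {
    std::vector<std::string> keys = {"remove_files_test/part_col=1/data_0.parquet",
                                     "remove_files_test/part_col=2/data_0.parquet"};
    for (auto &body : build_delete_bodies(keys)) {
        std::cout << body << "\n"; // one POST per body in the real implementation
    }
}

Quiet mode is what makes the error handling above so simple: a fully successful response contains no <Error> elements, so the implementation only has to scan the response for an Error tag via FindTagContents.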
test/sql/copy/s3/s3_remove_files.test

Lines changed: 76 additions & 0 deletions (new file)

@@ -0,0 +1,76 @@
+# name: test/sql/copy/s3/s3_remove_files.test
+# description: Test RemoveFiles functionality via COPY OVERWRITE on S3
+# group: [s3]
+
+require parquet
+
+require httpfs
+
+require-env S3_TEST_SERVER_AVAILABLE 1
+
+require-env AWS_DEFAULT_REGION
+
+require-env AWS_ACCESS_KEY_ID
+
+require-env AWS_SECRET_ACCESS_KEY
+
+require-env DUCKDB_S3_ENDPOINT
+
+require-env DUCKDB_S3_USE_SSL
+
+# override the default behaviour of skipping HTTP errors and connection failures: this test fails on connection issues
+set ignore_error_messages
+
+foreach url_style path vhost
+
+statement ok
+SET s3_url_style='${url_style}'
+
+# Step 1: Clean up any leftover files from previous test runs (OVERWRITE clears the directory)
+# and create initial partitioned file (part_col=1)
+statement ok
+COPY (SELECT 1 as part_col, 'initial_1' as value) TO 's3://test-bucket/remove_files_test' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE);
+
+statement ok
+COPY (SELECT 2 as part_col, 'initial_2' as value) TO 's3://test-bucket/remove_files_test' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE_OR_IGNORE);
+
+# Step 2: Verify both partitions exist
+query I
+SELECT count(*) FROM glob('s3://test-bucket/remove_files_test/**/*.parquet')
+----
+2
+
+query IT
+SELECT part_col, value FROM 's3://test-bucket/remove_files_test/**/*.parquet' ORDER BY part_col
+----
+1	initial_1
+2	initial_2
+
+# Step 3: OVERWRITE with completely different data (part_col=99)
+# This should delete all existing files via RemoveFiles and create new ones
+statement ok
+COPY (SELECT 99 as part_col, 'new_data' as value) TO 's3://test-bucket/remove_files_test' (FORMAT PARQUET, PARTITION_BY (part_col), OVERWRITE);
+
+# Step 4: Verify OLD partitions are removed
+query I
+SELECT count(*) FROM glob('s3://test-bucket/remove_files_test/part_col=1/*.parquet')
+----
+0
+
+query I
+SELECT count(*) FROM glob('s3://test-bucket/remove_files_test/part_col=2/*.parquet')
+----
+0
+
+# Step 5: Verify only NEW partition exists
+query I
+SELECT count(*) FROM glob('s3://test-bucket/remove_files_test/**/*.parquet')
+----
+1
+
+query IT
+SELECT part_col, value FROM 's3://test-bucket/remove_files_test/**/*.parquet'
+----
+99	new_data
+
+endloop

test/sql/httpfs/ca_cert_file.test

Lines changed: 16 additions & 0 deletions (new file)

@@ -0,0 +1,16 @@
+# name: test/sql/httpfs/ca_cert_file.test
+# description: Test that invalid ca_cert_file causes SSL connection to fail
+# group: [httpfs]
+
+require httpfs
+
+statement ok
+SET enable_server_cert_verification = true;
+
+statement ok
+SET ca_cert_file = 'README.md';
+
+statement error
+FROM read_blob('https://duckdb.org/')
+----
+<REGEX>:.*IO Error.*SSL.*
