
Commit 7df8b5c

Merge pull request #92 from samansmink/full-file-download-fallback
Automatic full file download fallback
2 parents c00e1f2 + 58f15dc

10 files changed: +174 -13 lines changed


.github/workflows/MinioTests.yml

Lines changed: 12 additions & 1 deletion
@@ -22,6 +22,8 @@ jobs:
       GEN: ninja
       VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
       VCPKG_TARGET_TRIPLET: x64-linux
+      PYTHON_HTTP_SERVER_URL: http://localhost:8008
+      PYTHON_HTTP_SERVER_DIR: /tmp/python_test_server
 
     steps:
     - uses: actions/checkout@v4
@@ -50,7 +52,9 @@ jobs:
 
     - name: Build
       shell: bash
-      run: make
+      run: |
+        echo -e "\nduckdb_extension_load(tpch)\n" >> extension_config.cmake
+        make
 
     - name: Start S3/HTTP test server
      shell: bash
@@ -62,6 +66,13 @@ jobs:
         source ./scripts/run_s3_test_server.sh
         sleep 30
 
+    - name: Run & Populate test server
+      shell: bash
+      run: |
+        mkdir -p $PYTHON_HTTP_SERVER_DIR
+        cd $PYTHON_HTTP_SERVER_DIR
+        python3 -m http.server 8008 &
+
     - name: Test
       shell: bash
       run: |

extension/httpfs/httpfs.cpp

Lines changed: 85 additions & 11 deletions
@@ -44,6 +44,7 @@ unique_ptr<HTTPParams> HTTPFSUtil::InitializeParameters(optional_ptr<FileOpener>
     // Setting lookups
     FileOpener::TryGetCurrentSetting(opener, "http_timeout", result->timeout, info);
     FileOpener::TryGetCurrentSetting(opener, "force_download", result->force_download, info);
+    FileOpener::TryGetCurrentSetting(opener, "auto_fallback_to_full_download", result->auto_fallback_to_full_download, info);
     FileOpener::TryGetCurrentSetting(opener, "http_retries", result->retries, info);
     FileOpener::TryGetCurrentSetting(opener, "http_retry_wait_ms", result->retry_wait_ms, info);
     FileOpener::TryGetCurrentSetting(opener, "http_retry_backoff", result->retry_backoff, info);
@@ -232,8 +233,7 @@ unique_ptr<HTTPResponse> HTTPFileSystem::GetRangeRequest(FileHandle &handle, str
     if (response.HasHeader("Content-Length")) {
         auto content_length = stoll(response.GetHeaderValue("Content-Length"));
         if ((idx_t)content_length != buffer_out_len) {
-            throw HTTPException("HTTP GET error: Content-Length from server mismatches requested "
-                                "range, server may not support range requests.");
+            RangeRequestNotSupportedException::Throw();
         }
     }
 }
@@ -256,6 +256,8 @@ unique_ptr<HTTPResponse> HTTPFileSystem::GetRangeRequest(FileHandle &handle, str
         return true;
     });
 
+    get_request.try_request = hfh.auto_fallback_to_full_file_download;
+
     auto response = http_util.Request(get_request, http_client);
 
     hfh.StoreClient(std::move(http_client));
@@ -345,9 +347,36 @@ unique_ptr<FileHandle> HTTPFileSystem::OpenFileExtended(const OpenFileInfo &file
     return std::move(handle);
 }
 
-// Buffered read from http file.
-// Note that buffering is disabled when FileFlags::FILE_FLAGS_DIRECT_IO is set
-void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
+bool HTTPFileSystem::TryRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, idx_t file_offset, char *buffer_out, idx_t buffer_out_len) {
+    auto res = GetRangeRequest(handle, url, header_map, file_offset, buffer_out, buffer_out_len);
+
+    if (res) {
+        // Request succeeded TODO: fix upstream that 206 is not considered success
+        if (res->Success() || res->status == HTTPStatusCode::PartialContent_206 || res->status == HTTPStatusCode::Accepted_202) {
+            return true;
+        }
+
+        // Request failed and we have a request error
+        if (res->HasRequestError()) {
+            ErrorData error(res->GetRequestError());
+
+            // Special case: we can do a retry with a full file download
+            if (error.Type() == RangeRequestNotSupportedException::TYPE && error.RawMessage() == RangeRequestNotSupportedException::MESSAGE) {
+                auto &hfh = handle.Cast<HTTPFileHandle>();
+                if (hfh.http_params.auto_fallback_to_full_download) {
+                    return false;
+                }
+            }
+            error.Throw();
+        }
+        throw HTTPException(*res, "Request returned HTTP %d for HTTP %s to '%s'",
+                            static_cast<int>(res->status), EnumUtil::ToString(RequestType::GET_REQUEST), res->url);
+    }
+    throw IOException("Unknown error for HTTP %s to '%s'", EnumUtil::ToString(RequestType::GET_REQUEST), url);
+}
+
+bool HTTPFileSystem::ReadInternal(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
     auto &hfh = handle.Cast<HTTPFileHandle>();
 
     D_ASSERT(hfh.http_params.state);
@@ -358,7 +387,7 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
         memcpy(buffer, hfh.cached_file_handle->GetData() + location, nr_bytes);
         DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location);
         hfh.file_offset = location + nr_bytes;
-        return;
+        return true;
     }
 
     idx_t to_read = nr_bytes;
@@ -367,21 +396,23 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
     // Don't buffer when DirectIO is set or when we are doing parallel reads
     bool skip_buffer = hfh.flags.DirectIO() || hfh.flags.RequireParallelAccess();
     if (skip_buffer && to_read > 0) {
-        GetRangeRequest(hfh, hfh.path, {}, location, (char *)buffer, to_read);
+        if (!TryRangeRequest(hfh, hfh.path, {}, location, (char *)buffer, to_read)) {
+            return false;
+        }
         DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location);
         // Update handle status within critical section for parallel access.
         if (hfh.flags.RequireParallelAccess()) {
             std::lock_guard<std::mutex> lck(hfh.mu);
             hfh.buffer_available = 0;
             hfh.buffer_idx = 0;
             hfh.file_offset = location + nr_bytes;
-            return;
+            return true;
         }
 
         hfh.buffer_available = 0;
         hfh.buffer_idx = 0;
         hfh.file_offset = location + nr_bytes;
-        return;
+        return true;
     }
 
     if (location >= hfh.buffer_start && location < hfh.buffer_end) {
@@ -413,13 +444,17 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
 
         // Bypass buffer if we read more than buffer size
         if (to_read > new_buffer_available) {
-            GetRangeRequest(hfh, hfh.path, {}, location + buffer_offset, (char *)buffer + buffer_offset, to_read);
+            if (!TryRangeRequest(hfh, hfh.path, {}, location + buffer_offset, (char *)buffer + buffer_offset, to_read)) {
+                return false;
+            }
             hfh.buffer_available = 0;
             hfh.buffer_idx = 0;
             start_offset += to_read;
             break;
         } else {
-            GetRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), new_buffer_available);
+            if (!TryRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), new_buffer_available)) {
+                return false;
+            }
             hfh.buffer_available = new_buffer_available;
             hfh.buffer_idx = 0;
             hfh.buffer_start = start_offset;
@@ -429,6 +464,32 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
     }
     hfh.file_offset = location + nr_bytes;
     DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location);
+    return true;
+}
+
+// Buffered read from http file.
+// Note that buffering is disabled when FileFlags::FILE_FLAGS_DIRECT_IO is set
+void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
+    auto success = ReadInternal(handle, buffer, nr_bytes, location);
+    if (success) {
+        return;
+    }
+
+    // ReadInternal returned false. This means the regular path of querying the file with range requests failed. We
+    // will attempt to download the full file and retry.
+
+    if (handle.logger) {
+        DUCKDB_LOG_WARN(handle.logger, "Falling back to full file download for file '%s': the server does not support HTTP range requests. Performance and memory usage are potentially degraded.", handle.path);
+    }
+
+    auto &hfh = handle.Cast<HTTPFileHandle>();
+
+    bool should_write_cache = false;
+    hfh.FullDownload(*this, should_write_cache);
+
+    if (!ReadInternal(handle, buffer, nr_bytes, location)) {
+        throw HTTPException("Failed to read from HTTP file after automatically retrying a full file download.");
+    }
 }
 
 int64_t HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes) {
@@ -642,6 +703,19 @@ void HTTPFileHandle::LoadFileInfo() {
     initialized = true;
 }
 
+
+void HTTPFileHandle::TryAddLogger(FileOpener &opener) {
+    auto context = opener.TryGetClientContext();
+    if (context) {
+        logger = context->logger;
+        return;
+    }
+    auto database = opener.TryGetDatabase();
+    if (database) {
+        logger = database->GetLogManager().GlobalLoggerReference();
+    }
+}
+
 void HTTPFileHandle::Initialize(optional_ptr<FileOpener> opener) {
     auto &hfs = file_system.Cast<HTTPFileSystem>();
     http_params.state = HTTPState::TryGetState(opener);
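
The heart of the change is the handshake between GetRangeRequest and TryRangeRequest: a Content-Length mismatch is thrown as a fixed sentinel message via RangeRequestNotSupportedException::Throw(), and TryRangeRequest recognizes it by comparing the ErrorData type and raw message against the class constants, returning false (rather than raising) when auto_fallback_to_full_download is enabled. The following standalone sketch models that control flow with plain std::runtime_error and hypothetical names; the real code matches on ErrorData captured in the response rather than in a catch block.

#include <iostream>
#include <stdexcept>
#include <string>

// Sentinel error, analogous to RangeRequestNotSupportedException above.
struct RangeNotSupported {
    static constexpr const char *MESSAGE = "server may not support range requests";
    static void Throw() {
        throw std::runtime_error(MESSAGE);
    }
};

// Returns true on success, false if the caller should fall back to a full
// file download; anything that is not the sentinel still propagates.
bool TryRangeRead(bool server_honors_ranges, bool allow_fallback) {
    try {
        if (!server_honors_ranges) {
            RangeNotSupported::Throw(); // mirrors the Content-Length mismatch path
        }
        return true; // range read succeeded
    } catch (const std::runtime_error &e) {
        if (allow_fallback && std::string(e.what()) == RangeNotSupported::MESSAGE) {
            return false; // mirrors TryRangeRequest returning false
        }
        throw; // other errors are not converted into a fallback
    }
}

int main() {
    std::cout << TryRangeRead(true, true) << '\n';  // 1: range path worked
    std::cout << TryRangeRead(false, true) << '\n'; // 0: fall back to full download
}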

extension/httpfs/httpfs_extension.cpp

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ static void LoadInternal(ExtensionLoader &loader) {
     config.AddExtensionOption("http_retries", "HTTP retries on I/O error", LogicalType::UBIGINT, Value(3));
     config.AddExtensionOption("http_retry_wait_ms", "Time between retries", LogicalType::UBIGINT, Value(100));
     config.AddExtensionOption("force_download", "Forces upfront download of file", LogicalType::BOOLEAN, Value(false));
+    config.AddExtensionOption("auto_fallback_to_full_download", "Allows automatically falling back to full file downloads when possible.", LogicalType::BOOLEAN, Value(true));
     // Reduces the number of requests made while waiting, for example retry_wait_ms of 50 and backoff factor of 2 will
     // result in wait times of 0 50 100 200 400...etc.
     config.AddExtensionOption("http_retry_backoff", "Backoff factor for exponentially increasing retry wait time",

extension/httpfs/include/httpfs.hpp

Lines changed: 21 additions & 1 deletion
@@ -14,6 +14,19 @@
 
 namespace duckdb {
 
+class RangeRequestNotSupportedException {
+public:
+    // Call static Throw instead: if thrown as exception DuckDB can't catch it.
+    explicit RangeRequestNotSupportedException() = delete;
+
+    static constexpr ExceptionType TYPE = ExceptionType::HTTP;
+    static constexpr const char *MESSAGE = "Content-Length from server mismatches requested range, server may not support range requests. You can try to resolve this by enabling `SET force_download=true`";
+
+    static void Throw() {
+        throw HTTPException(MESSAGE);
+    }
+};
+
 class HTTPClientCache {
 public:
     //! Get a client from the client cache
@@ -50,6 +63,8 @@ class HTTPFileHandle : public FileHandle {
     string etag;
     bool force_full_download;
     bool initialized = false;
+
+    bool auto_fallback_to_full_file_download = true;
 
     // In write overwrite mode, we are not interested in the current state of the file: we're overwriting it.
     bool write_overwrite_mode = false;
@@ -88,7 +103,10 @@ class HTTPFileHandle : public FileHandle {
     //! Perform a HEAD request to get the file info (if not yet loaded)
     void LoadFileInfo();
 
-private:
+    //! TODO: make base function virtual?
+    void TryAddLogger(FileOpener &opener);
+
+public:
     //! Fully downloads a file
     void FullDownload(HTTPFileSystem &hfs, bool &should_write_cache);
 };
@@ -157,6 +175,8 @@ class HTTPFileSystem : public FileSystem {
     }
 
     virtual HTTPException GetHTTPError(FileHandle &, const HTTPResponse &response, const string &url);
+    bool TryRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, idx_t file_offset, char *buffer_out, idx_t buffer_out_len);
+    bool ReadInternal(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location);
 
 protected:
     virtual duckdb::unique_ptr<HTTPFileHandle> CreateHandle(const OpenFileInfo &file, FileOpenFlags flags,

extension/httpfs/include/httpfs_client.hpp

Lines changed: 2 additions & 0 deletions
@@ -16,8 +16,10 @@ struct HTTPFSParams : public HTTPParams {
     static constexpr bool DEFAULT_ENABLE_SERVER_CERT_VERIFICATION = false;
     static constexpr uint64_t DEFAULT_HF_MAX_PER_PAGE = 0;
     static constexpr bool DEFAULT_FORCE_DOWNLOAD = false;
+    static constexpr bool AUTO_FALLBACK_TO_FULL_DOWNLOAD = true;
 
     bool force_download = DEFAULT_FORCE_DOWNLOAD;
+    bool auto_fallback_to_full_download = AUTO_FALLBACK_TO_FULL_DOWNLOAD;
     bool enable_server_cert_verification = DEFAULT_ENABLE_SERVER_CERT_VERIFICATION;
     bool enable_curl_server_cert_verification = true;
     idx_t hf_max_per_page = DEFAULT_HF_MAX_PER_PAGE;

extension/httpfs/include/s3fs.hpp

Lines changed: 1 addition & 0 deletions
@@ -116,6 +116,7 @@ class S3FileHandle : public HTTPFileHandle {
         : HTTPFileHandle(fs, file, flags, std::move(http_params_p)), auth_params(auth_params_p),
           config_params(config_params_p), uploads_in_progress(0), parts_uploaded(0), upload_finalized(false),
           uploader_has_error(false), upload_exception(nullptr) {
+        auto_fallback_to_full_file_download = false;
     if (flags.OpenForReading() && flags.OpenForWriting()) {
         throw NotImplementedException("Cannot open an HTTP file for both reading and writing");
     } else if (flags.OpenForAppending()) {
test/sql/full_file_download_fallback.test

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+# name: test/sql/full_file_download_fallback.test
+# group: [full_file_download]
+
+require parquet
+
+require httpfs
+
+require tpch
+
+require-env PYTHON_HTTP_SERVER_URL
+
+require-env PYTHON_HTTP_SERVER_DIR
+
+statement ok
+CALL enable_logging();
+
+statement ok
+call dbgen(sf=1);
+
+statement ok
+copy lineitem to '${PYTHON_HTTP_SERVER_DIR}/lineitem.csv'
+
+statement ok
+drop table lineitem;
+
+statement ok
+CREATE view lineitem AS FROM '${PYTHON_HTTP_SERVER_URL}/lineitem.csv';
+
+query I
+pragma tpch(6);
+----
+123141078.22829981
+
+query I
+select count(*) from duckdb_logs where log_level='WARN' and message like '%Falling back to full%'
+----
+2
+
+statement ok
+set auto_fallback_to_full_download=false
+
+statement error
+pragma tpch(6);
+----
+HTTP Error: Content-Length from server mismatches requested range, server may not support range requests. You can try to resolve this by enabling `SET force_download=true`
+
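
Python's built-in http.server does not implement Range requests, which is presumably why the workflow above stands one up: every ranged GET returns the full body, trips the Content-Length check, and exercises the fallback. A rough C++-API equivalent of the test's log inspection (hypothetical URL; assumes that same server is running and httpfs is available):

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr);
    duckdb::Connection con(db);

    con.Query("LOAD httpfs;");
    con.Query("CALL enable_logging();");

    // Read a file from a server without range support (hypothetical URL);
    // httpfs should warn and transparently retry with a full download.
    con.Query("SELECT count(*) FROM 'http://localhost:8008/lineitem.csv';");

    auto warnings = con.Query(
        "SELECT count(*) FROM duckdb_logs "
        "WHERE log_level = 'WARN' AND message LIKE '%Falling back to full%';");
    warnings->Print();
    return 0;
}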

test/sql/secret/secret_aws.test

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,8 @@ require-env DUCKDB_S3_ENDPOINT
 
 require-env DUCKDB_S3_USE_SSL
 
+set ignore_error_messages
+
 require httpfs
 
 require parquet

test/sql/secret/secret_refresh.test

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,8 @@ require-env DUCKDB_S3_ENDPOINT
 
 require-env DUCKDB_S3_USE_SSL
 
+set ignore_error_messages
+
 require httpfs
 
 require parquet

test/sql/secret/secret_refresh_attach.test

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,8 @@ require-env DUCKDB_S3_USE_SSL
 
 require-env S3_ATTACH_DB
 
+set ignore_error_messages
+
 require httpfs
 
 require parquet
