Skip to content

Commit 4f2c94a

Browse files
Fix warnings and truncations around filesize_to_str() (#6987)
* filesize_to_str(): accept uint64_t, not size_t On 32-bit platforms, we would otherwise perform a 40-bit shift of a 32-bit size_t, which is undefined behaviour. Moreover, files can be larger than the address space of the process. * fread: better error for files that are too large On non-Windows, print the errno when file exists but we couldn't open it. That will happen for files larger than the address space when off_t is 32-bit (default on Linux+glibc). In case an operating system does let us open a large file, check the size before casting it to size_t. (This happens at least on Windows.) * filesize_to_str(): print binary SI prefixes * Update src/fread.c Co-authored-by: Michael Chirico <[email protected]> * Print both file size and SIZE_MAX upon failure Co-authored-by: Michael Chirico <[email protected]> * NEWS entries * Fix double blunder - Make sure to use the right platform-specific variable in both cases - Can't use filesize_to_str() more than once due to its static buffer * tweak NEWS --------- Co-authored-by: Michael Chirico <[email protected]> Co-authored-by: Michael Chirico <[email protected]>
1 parent db834d4 commit 4f2c94a

File tree

2 files changed

+24
-10
lines changed

2 files changed

+24
-10
lines changed

NEWS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@
4747

4848
3. {data.table} now depends on R 3.4.0 (2017).
4949

50+
4. Changes to `fread()` output and errors:
51+
52+
+ When the size of the file exceeds the size of the address space, `fread()` now signals an informative error instead of trying to map its size modulo the address space.
53+
+ On non-Windows systems, `fread()` now prints the reason why the file couldn't be opened, which could also be due to it being too large to map.
54+
+ With `verbose=TRUE`, file sizes are now printed using correct binary SI prefixes (the sizes have always been reported as bytes denominated in powers of `2^10`, so e.g. `1024*1024` bytes was reported as `1 MB` where `1 MiB` or `1.05 MB` is correct).
55+
5056
## data.table [v1.17.0](https://github.com/Rdatatable/data.table/milestone/34) (20 Feb 2025)
5157

5258
### POTENTIALLY BREAKING CHANGES

src/fread.c

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -402,18 +402,18 @@ double wallclock(void)
402402
/**
403403
* Helper function to print file's size in human-readable format. This will
404404
* produce strings such as:
405-
* 44.74GB (48043231704 bytes)
406-
* 921MB (965757797 bytes)
407-
* 2.206MB (2313045 bytes)
408-
* 38.69KB (39615 bytes)
405+
* 44.74GiB (48043231704 bytes)
406+
* 921MiB (965757797 bytes)
407+
* 2.206MiB (2313045 bytes)
408+
* 38.69KiB (39615 bytes)
409409
* 214 bytes
410410
* 0 bytes
411411
* The function returns a pointer to a static string buffer, so the caller
412412
* should not attempt to deallocate the buffer, or call this function from
413413
* multiple threads at the same time, or hold on to the value returned for
414414
* extended periods of time.
415415
*/
416-
static const char* filesize_to_str(const size_t fsize)
416+
static const char* filesize_to_str(const uint64_t fsize)
417417
{
418418
static const char suffixes[] = {'T', 'G', 'M', 'K'};
419419
static char output[100];
@@ -426,12 +426,12 @@ static const char* filesize_to_str(const size_t fsize)
426426
}
427427
if (ndigits == 0 || (fsize == (fsize >> shift << shift))) {
428428
if (i < sizeof(suffixes)) {
429-
snprintf(output, sizeof(output), "%"PRIu64"%cB (%"PRIu64" bytes)", // # notranslate
430-
(fsize >> shift), suffixes[i], fsize);
429+
snprintf(output, sizeof(output), "%"PRIu64"%ciB (%"PRIu64" bytes)", // # notranslate
430+
fsize >> shift, suffixes[i], fsize);
431431
return output;
432432
}
433433
} else {
434-
snprintf(output, sizeof(output), "%.*f%cB (%"PRIu64" bytes)", // # notranslate
434+
snprintf(output, sizeof(output), "%.*f%ciB (%"PRIu64" bytes)", // # notranslate
435435
ndigits, (double)fsize / (1LL << shift), suffixes[i], fsize);
436436
return output;
437437
}
@@ -1415,12 +1415,16 @@ int freadMain(freadMainArgs _args) {
14151415
const char* fnam = args.filename;
14161416
#ifndef WIN32
14171417
int fd = open(fnam, O_RDONLY);
1418-
if (fd==-1) STOP(_("File not found: %s"),fnam);
1418+
if (fd==-1) STOP(_("Couldn't open file %s: %s"),fnam, strerror(errno));
14191419
struct stat stat_buf;
14201420
if (fstat(fd, &stat_buf) == -1) {
14211421
close(fd); // # nocov
14221422
STOP(_("Opened file ok but couldn't obtain its size: %s"), fnam); // # nocov
14231423
}
1424+
if (stat_buf.st_size > SIZE_MAX) {
1425+
close(fd); // # nocov
1426+
STOP(_("File size [%s] exceeds the address space: %s"), filesize_to_str(stat_buf.st_size), fnam); // # nocov
1427+
}
14241428
fileSize = (size_t) stat_buf.st_size;
14251429
if (fileSize == 0) {close(fd); STOP(_("File is empty: %s"), fnam);}
14261430
if (verbose) DTPRINT(_(" File opened, size = %s.\n"), filesize_to_str(fileSize));
@@ -1453,8 +1457,12 @@ int freadMain(freadMainArgs _args) {
14531457
if (hFile==INVALID_HANDLE_VALUE) STOP(_("Unable to open file after %d attempts (error %lu): %s"), attempts, GetLastError(), fnam);
14541458
LARGE_INTEGER liFileSize;
14551459
if (GetFileSizeEx(hFile,&liFileSize)==0) { CloseHandle(hFile); STOP(_("GetFileSizeEx failed (returned 0) on file: %s"), fnam); }
1460+
if (liFileSize.QuadPart > SIZE_MAX) {
1461+
CloseHandle(hFile); // # nocov
1462+
STOP(_("File size [%s] exceeds the address space: %s"), filesize_to_str(liFileSize.QuadPart), fnam); // # nocov
1463+
}
14561464
fileSize = (size_t)liFileSize.QuadPart;
1457-
if (fileSize<=0) { CloseHandle(hFile); STOP(_("File is empty: %s"), fnam); }
1465+
if (fileSize==0) { CloseHandle(hFile); STOP(_("File is empty: %s"), fnam); }
14581466
if (verbose) DTPRINT(_(" File opened, size = %s.\n"), filesize_to_str(fileSize));
14591467
HANDLE hMap=CreateFileMapping(hFile, NULL, PAGE_WRITECOPY, 0, 0, NULL);
14601468
if (hMap==NULL) { CloseHandle(hFile); STOP(_("This is Windows, CreateFileMapping returned error %lu for file %s"), GetLastError(), fnam); }

0 commit comments

Comments
 (0)