Skip to content

Commit 79aed53

Browse files
MichaelChiricophilippechataignonaitap
authored
Fix up some potential memory leaks in fwrite (#6757)
* More careful about free() before any STOP(), #ifndef location * Also don't assign mystream unless is_gzip * missing nocov ends * zstreams on stack, no more thread_streams on heap * back to !BUFF style of alloc checking * Avoid the leak on zero-row fwrite * Avoid memory leak on error path This isn't covered by the tests, but manually failing this allocation in vgdb results in a leak otherwise: 268,096 (5,952 direct, 262,144 indirect) bytes in 1 blocks are definitely lost in loss record 1,574 of 1,601 at 0x48407B4: malloc (vg_replace_malloc.c:381) by 0x74ACD86: deflateInit2_ (in /lib/x86_64-linux-gnu/libz.so.1.2.13) by 0x90EA5232: init_stream (fwrite.c:576) by 0x90EA5EB0: fwriteMain (fwrite.c:806) by 0x90EA79EE: fwriteR (fwriteR.c:310) * Typo, translate Co-authored-by: aitap <[email protected]> * more translations of DTPRINT * Skip redundant 'else' * new cite in NEWS --------- Co-authored-by: Philippe Chataignon <[email protected]> Co-authored-by: Ivan K <[email protected]>
1 parent 4899b39 commit 79aed53

File tree

2 files changed

+59
-61
lines changed

2 files changed

+59
-61
lines changed

NEWS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ rowwiseDT(
6969

7070
6. `fread()` gains `logicalYN` argument to read columns consisting only of strings `Y`, `N` as `logical` (as opposed to character), [#4563](https://github.com/Rdatatable/data.table/issues/4563). The default is controlled by option `datatable.logicalYN`, itself defaulting to `FALSE`, for back-compatibility -- some smaller tables (especially sharded tables) might inadvertently read a "true" string column as `logical` and cause bugs. This is particularly important for tables with a column named `y` or `n` -- automatic header detection under `logicalYN=TRUE` will see these values in the first row as being "data" as opposed to column names. A parallel option was not included for `fwrite()` at this time -- users looking for a compact representation of logical columns can still use `fwrite(logical01=TRUE)`. We also opted for now to check only `Y`, `N` and not `Yes`/`No`/`YES`/`NO`.
7171

72-
7. `fwrite()` with `compress="gzip"` produces compatible gz files when composed of multiple independent chunks owing to parallelization, [#6356](https://github.com/Rdatatable/data.table/issues/6356). Earlier `fwrite()` versions could have issues with HTTP upload using `Content-Encoding: gzip` and `Transfer-Encoding: chunked`. Thanks to @oliverfoster for report and @philippechataignon for the fix.
72+
7. `fwrite()` with `compress="gzip"` produces compatible gz files when composed of multiple independent chunks owing to parallelization, [#6356](https://github.com/Rdatatable/data.table/issues/6356). Earlier `fwrite()` versions could have issues with HTTP upload using `Content-Encoding: gzip` and `Transfer-Encoding: chunked`. Thanks to @oliverfoster for report and @philippechataignon for the fix. Thanks also @aitap for pre-release testing that found some possible memory leaks in the initial fix.
7373

7474
8. `fwrite()` gains a new parameter `compressLevel` to control compression level for gzip, [#5506](https://github.com/Rdatatable/data.table/issues/5506). This parameter balances compression speed and total compression, and corresponds directly to the analogous command-line parameter, e.g. `compressLevel=4` corresponds to passing `-4`; the default, `6`, matches the command-line default, i.e. equivalent to passing `-6`. Thanks @mgarbuzov for the request and @philippechataignon for implementing.
7575

src/fwrite.c

Lines changed: 58 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -784,7 +784,7 @@ void fwriteMain(fwriteMainArgs args)
784784
}
785785

786786
// alloc nth write buffers
787-
errno=0;
787+
errno = 0;
788788
size_t alloc_size = nth * buffSize;
789789
if (verbose) {
790790
DTPRINT(_("Allocate %zu bytes (%zu MiB) for buffPool\n"), alloc_size, alloc_size / MEGA);
@@ -797,28 +797,20 @@ void fwriteMain(fwriteMainArgs args)
797797

798798
// init compress variables
799799
#ifndef NOZLIB
800-
z_stream *thread_streams = NULL;
800+
z_stream strm;
801+
// NB: fine to free() this even if unallocated
801802
char *zbuffPool = NULL;
802803
size_t zbuffSize = 0;
803804
size_t compress_len = 0;
804805
if (args.is_gzip) {
805-
// alloc zlib streams
806-
thread_streams = (z_stream*) malloc(nth * sizeof(z_stream));
807-
if (verbose) {
808-
DTPRINT(_("Allocate %zu bytes for thread_streams\n"), nth * sizeof(z_stream));
806+
// compute zbuffSize which is the same for each thread
807+
if (init_stream(&strm) != Z_OK) {
808+
// # nocov start
809+
free(buffPool);
810+
STOP(_("Can't init stream structure for deflateBound"));
811+
// # nocov end
809812
}
810-
if (!thread_streams)
811-
STOP(_("Failed to allocated %d bytes for threads_streams."), (int)(nth * sizeof(z_stream))); // #nocov
812-
// VLA on stack should be fine for nth structs; in zlib v1.2.11 sizeof(struct)==112 on 64bit
813-
// not declared inside the parallel region because solaris appears to move the struct in
814-
// memory when the #pragma omp for is entered, which causes zlib's internal self reference
815-
// pointer to mismatch, #4099
816-
817-
// compute zbuffSize which is the same for each thread
818-
z_stream *stream = thread_streams;
819-
if (init_stream(stream) != Z_OK)
820-
STOP(_("Can't init stream structure for deflateBound")); // #nocov
821-
zbuffSize = deflateBound(stream, buffSize);
813+
zbuffSize = deflateBound(&strm, buffSize);
822814
if (verbose)
823815
DTPRINT(_("zbuffSize=%d returned from deflateBound\n"), (int)zbuffSize);
824816

@@ -832,11 +824,13 @@ void fwriteMain(fwriteMainArgs args)
832824
if (!zbuffPool) {
833825
// # nocov start
834826
free(buffPool);
827+
deflateEnd(&strm);
835828
STOP(_("Unable to allocate %zu MiB * %d thread compressed buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."),
836829
zbuffSize / MEGA, nth, errno, strerror(errno));
837830
// # nocov end
838831
}
839832
}
833+
840834
#endif
841835

842836
// write header
@@ -880,11 +874,8 @@ void fwriteMain(fwriteMainArgs args)
880874
DTPRINT("%s", buff);
881875
} else {
882876
int ret0=0, ret1=0, ret2=0;
883-
if (args.is_gzip) {
884877
#ifndef NOZLIB
885-
z_stream *stream = thread_streams;
886-
if (init_stream(stream) != Z_OK)
887-
STOP(_("Can't init stream structure for writing header")); // #nocov
878+
if (args.is_gzip) {
888879
char* zbuff = zbuffPool;
889880
// write minimal gzip header
890881
char* header = "\037\213\10\0\0\0\0\0\0\3";
@@ -895,19 +886,26 @@ void fwriteMain(fwriteMainArgs args)
895886
size_t zbuffUsed = zbuffSize;
896887
len = (size_t)(ch - buff);
897888
crc = crc32(crc, (unsigned char*)buff, len);
898-
ret1 = compressbuff(stream, zbuff, &zbuffUsed, buff, len);
889+
ret1 = compressbuff(&strm, zbuff, &zbuffUsed, buff, len);
890+
deflateEnd(&strm);
899891
if (ret1==Z_OK) {
900892
ret2 = WRITE(f, zbuff, (int)zbuffUsed);
901893
compress_len += zbuffUsed;
902894
}
903-
#endif
904895
} else {
896+
#endif
905897
ret2 = WRITE(f, buff, (int)(ch-buff));
898+
#ifndef NOZLIB
906899
}
900+
#endif
907901
if (ret0 == -1 || ret1 || ret2 == -1) {
908902
// # nocov start
909903
int errwrite = errno; // capture write errno now in case close fails with a different errno
910904
CLOSE(f);
905+
free(buffPool);
906+
#ifndef NOZLIB
907+
free(zbuffPool);
908+
#endif
911909
if (ret0 == -1) STOP(_("Can't write gzip header error: %d"), ret0);
912910
else if (ret1) STOP(_("Compress gzip error: %d"), ret1);
913911
else STOP(_("%s: '%s'"), strerror(errwrite), args.filename);
@@ -922,6 +920,10 @@ void fwriteMain(fwriteMainArgs args)
922920
if (args.nrow == 0) {
923921
if (verbose)
924922
DTPRINT(_("No data rows present (nrow==0)\n"));
923+
free(buffPool);
924+
#ifndef NOZLIB
925+
free(zbuffPool);
926+
#endif
925927
if (f != -1 && CLOSE(f))
926928
STOP(_("%s: '%s'"), strerror(errno), args.filename); // # nocov
927929
return;
@@ -947,14 +949,14 @@ void fwriteMain(fwriteMainArgs args)
947949
char* ch = myBuff;
948950

949951
#ifndef NOZLIB
952+
z_stream mystream;
950953
size_t mylen = 0;
951954
int mycrc = 0;
952-
z_stream *mystream = &thread_streams[me];
953955
void *myzBuff = NULL;
954956
size_t myzbuffUsed = 0;
955957
if (args.is_gzip) {
956958
myzBuff = zbuffPool + me * zbuffSize;
957-
if (init_stream(mystream) != Z_OK) { // this should be thread safe according to zlib documentation
959+
if (init_stream(&mystream) != Z_OK) { // this should be thread safe according to zlib documentation
958960
failed = true; // # nocov
959961
my_failed_compress = -998; // # nocov
960962
}
@@ -1002,7 +1004,8 @@ void fwriteMain(fwriteMainArgs args)
10021004
myzbuffUsed = zbuffSize;
10031005
mylen = (size_t)(ch - myBuff);
10041006
mycrc = crc32(0, (unsigned char*)myBuff, mylen);
1005-
int ret = compressbuff(mystream, myzBuff, &myzbuffUsed, myBuff, mylen);
1007+
int ret = compressbuff(&mystream, myzBuff, &myzbuffUsed, myBuff, mylen);
1008+
deflateEnd(&mystream);
10061009
if (ret) {
10071010
failed=true;
10081011
my_failed_compress=ret;
@@ -1023,25 +1026,27 @@ void fwriteMain(fwriteMainArgs args)
10231026
if (f == -1) {
10241027
*ch='\0'; // standard C string end marker so DTPRINT knows where to stop
10251028
DTPRINT("%s", myBuff);
1026-
} else if (args.is_gzip) {
1029+
} else
10271030
#ifndef NOZLIB
1028-
ret = WRITE(f, myzBuff, (int)myzbuffUsed);
1029-
compress_len += myzbuffUsed;
1031+
if (args.is_gzip) {
1032+
ret = WRITE(f, myzBuff, (int)myzbuffUsed);
1033+
compress_len += myzbuffUsed;
1034+
} else
10301035
#endif
1031-
} else {
1036+
{
10321037
ret = WRITE(f, myBuff, (int)(ch-myBuff));
10331038
}
10341039
if (ret == -1) {
10351040
failed=true; // # nocov
10361041
failed_write=errno; // # nocov
10371042
}
10381043

1039-
if (args.is_gzip) {
10401044
#ifndef NOZLIB
1045+
if (args.is_gzip) {
10411046
crc = crc32_combine(crc, mycrc, mylen);
10421047
len += mylen;
1043-
#endif
10441048
}
1049+
#endif
10451050

10461051
int used = 100 * ((double)(ch - myBuff)) / buffSize; // percentage of original buffMB
10471052
if (used > maxBuffUsedPC)
@@ -1067,36 +1072,29 @@ void fwriteMain(fwriteMainArgs args)
10671072
}
10681073
}
10691074
}
1070-
if (args.is_gzip) {
1071-
#ifndef NOZLIB
1072-
deflateEnd(mystream);
1073-
#endif
1074-
}
10751075

10761076
} // end of parallel for loop
10771077

1078+
free(buffPool);
1079+
1080+
#ifndef NOZLIB
1081+
free(zbuffPool);
1082+
10781083
/* put a 4-byte integer into a byte array in LSB order */
10791084
#define PUT4(a,b) ((a)[0]=(b), (a)[1]=(b)>>8, (a)[2]=(b)>>16, (a)[3]=(b)>>24)
10801085

10811086
// write gzip tailer with crc and len
1082-
if (args.is_gzip) {
1083-
#ifndef NOZLIB
1084-
unsigned char tail[10];
1085-
tail[0] = 3;
1086-
tail[1] = 0;
1087-
PUT4(tail + 2, crc);
1088-
PUT4(tail + 6, len);
1089-
int ret = WRITE(f, tail, 10);
1090-
compress_len += 10;
1091-
if (ret == -1)
1092-
STOP("Error: can't write gzip tailer"); // # nocov
1093-
#endif
1094-
}
1095-
1096-
free(buffPool);
1097-
#ifndef NOZLIB
1098-
free(thread_streams);
1099-
free(zbuffPool);
1087+
if (args.is_gzip) {
1088+
unsigned char tail[10];
1089+
tail[0] = 3;
1090+
tail[1] = 0;
1091+
PUT4(tail + 2, crc);
1092+
PUT4(tail + 6, len);
1093+
int ret = WRITE(f, tail, 10);
1094+
compress_len += 10;
1095+
if (ret == -1)
1096+
STOP(_("Failed to write gzip trailer")); // # nocov
1097+
}
11001098
#endif
11011099

11021100
// Finished parallel region and can call R API safely now.
@@ -1112,13 +1110,13 @@ void fwriteMain(fwriteMainArgs args)
11121110
}
11131111

11141112
if (verbose) {
1115-
if (args.is_gzip) {
11161113
#ifndef NOZLIB
1117-
DTPRINT("zlib: uncompressed length=%zu (%zu MiB), compressed length=%zu (%zu MiB), ratio=%.1f%%, crc=%x\n",
1114+
if (args.is_gzip) {
1115+
DTPRINT(_("zlib: uncompressed length=%zu (%zu MiB), compressed length=%zu (%zu MiB), ratio=%.1f%%, crc=%x\n"),
11181116
len, len / MEGA, compress_len, compress_len / MEGA, len != 0 ? (100.0 * compress_len) / len : 0, crc);
1119-
#endif
11201117
}
1121-
DTPRINT("Written %"PRId64" rows in %.3f secs using %d thread%s. MaxBuffUsed=%d%%\n",
1118+
#endif
1119+
DTPRINT(_("Written %"PRId64" rows in %.3f secs using %d thread%s. MaxBuffUsed=%d%%\n"),
11221120
args.nrow, 1.0*(wallclock()-t0), nth, nth ==1 ? "" : "s", maxBuffUsedPC);
11231121
}
11241122

0 commit comments

Comments
 (0)