Skip to content

Commit aef21f3

Browse files
author
kalibera
committed
Improve precision of detecting bzip2 compressed stream (PR#18768). Reduce
code duplication. git-svn-id: https://svn.r-project.org/R/trunk@87446 00db46b3-68df-0310-9c12-caf00c1e9a41
1 parent 6af86c3 commit aef21f3

File tree

1 file changed

+77
-52
lines changed

1 file changed

+77
-52
lines changed

src/main/connections.c

Lines changed: 77 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -2387,6 +2387,58 @@ newxzfile(const char *description, const char *mode, int type, int compress)
23872387
return new;
23882388
}
23892389

2390+
/* Compression format recognised from the leading magic bytes of a stream. */
typedef enum {
    COMP_UNKNOWN = 0, /* no known magic number matched */
    COMP_GZ,          /* gzip header (or a zlib header, when the caller asks) */
    COMP_BZ,          /* bzip2 */
    COMP_XZ           /* xz or lzma; the detector reports which via 'subtype' */
} comp_type;
2391+
2392+
static comp_type
2393+
comp_type_from_memory(char *buf, size_t len, Rboolean with_zlib, int *subtype)
2394+
{
2395+
if(len >= 2 && buf[0] == '\x1f' && buf[1] == '\x8b')
2396+
return COMP_GZ;
2397+
else if(with_zlib && len>=2 && buf[0] == '\x78' && buf[1] == '\x9c')
2398+
/* zlib commression starts with 2 bytes, which for default settings are
2399+
\x78\x9c. We could use that */
2400+
return COMP_GZ;
2401+
else if(len >= 10 && !strncmp(buf, "BZh", 3)) {
2402+
/* check also the block size and the block/eos magic to reduce
2403+
the risk of picking up an uncompressed file (PR#18768) */
2404+
if (buf[3] >= '1' && buf[3] <= '9') {
2405+
// 0x314159265359 (BCD (pi))
2406+
// 0x177245385090 (BCD sqrt(pi))
2407+
if (!memcmp(buf+4, "\x31\x41\x59\x26\x53\x59", 6) ||
2408+
!memcmp(buf+4, "\x17\x72\x45\x38\x50\x90", 6))
2409+
2410+
return COMP_BZ;
2411+
}
2412+
} else if(len >= 5 && buf[0] == '\xFD' && !strncmp(buf+1, "7zXZ", 4)) {
2413+
*subtype = 0;
2414+
return COMP_XZ;
2415+
} else if(len >= 5 && buf[0] == '\xFF' && !strncmp(buf+1, "LZMA", 4)) {
2416+
*subtype = 1;
2417+
return COMP_XZ;
2418+
} else if(len >= 5 && !memcmp(buf, "]\0\0\200\0", 5)) {
2419+
*subtype = 1;
2420+
return COMP_XZ;
2421+
} else if(len >= 4 && buf[0] == '\x89' && !strncmp(buf+1, "LZO", 3))
2422+
error(_("this is a %s-compressed file which this build of R does not support"),
2423+
"lzop");
2424+
return COMP_UNKNOWN;
2425+
}
2426+
2427+
static comp_type
2428+
comp_type_from_file(const char *name, Rboolean with_zlib, int *subtype)
2429+
{
2430+
FILE *fp = fopen(name, "rb");
2431+
char buf[10];
2432+
2433+
if (fp) {
2434+
size_t res = fread(buf, 1, sizeof(buf), fp);
2435+
fclose(fp);
2436+
if(res > 0)
2437+
return comp_type_from_memory(buf, res, with_zlib, subtype);
2438+
}
2439+
return COMP_UNKNOWN;
2440+
}
2441+
23902442
/* op 0 is gzfile, 1 is bzfile, 2 is xz/lzma */
23912443
attribute_hidden SEXP do_gzfile(SEXP call, SEXP op, SEXP args, SEXP env)
23922444
{
@@ -2425,27 +2477,18 @@ attribute_hidden SEXP do_gzfile(SEXP call, SEXP op, SEXP args, SEXP env)
24252477
open = CHAR(STRING_ELT(sopen, 0)); /* ASCII */
24262478
if (type == 0 && (!open[0] || open[0] == 'r')) {
24272479
/* check magic no */
2428-
FILE *fp = fopen(R_ExpandFileName(file), "rb");
2429-
char buf[7];
2430-
if (fp) {
2431-
size_t res;
2432-
memset(buf, 0, 7); res = fread(buf, 5, 1, fp); fclose(fp);
2433-
if(res == 1) {
2434-
if(!strncmp(buf, "BZh", 3)) type = 1;
2435-
if((buf[0] == '\xFD') && !strncmp(buf+1, "7zXZ", 4)) type = 2;
2436-
if((buf[0] == '\xFF') && !strncmp(buf+1, "LZMA", 4)) {
2437-
type = 2; subtype = 1;
2438-
}
2439-
if(!memcmp(buf, "]\0\0\200\0", 5)) {
2440-
type = 2; subtype = 1;
2441-
}
2442-
if((buf[0] == '\x89') && !strncmp(buf+1, "LZO", 3))
2443-
error(_("this is a %s-compressed file which this build of R does not support"), "lzop");
2444-
}
2480+
comp_type ct;
2481+
ct = comp_type_from_file(R_ExpandFileName(file), FALSE, &subtype);
2482+
switch(ct) {
2483+
case COMP_GZ:
2484+
case COMP_UNKNOWN: type = 0; break;
2485+
case COMP_BZ: type = 1; break;
2486+
case COMP_XZ: type = 2; break;
24452487
}
24462488
}
24472489
switch(type) {
24482490
case 0:
2491+
/* gzfile connection handles also transparent (uncompressed) files */
24492492
con = newgzfile(file, strlen(open) ? open : "rb", compress);
24502493
break;
24512494
case 1:
@@ -5732,36 +5775,21 @@ attribute_hidden SEXP do_url(SEXP call, SEXP op, SEXP args, SEXP env)
57325775
if (!raw &&
57335776
(!strlen(open) || streql(open, "r") || streql(open, "rt"))) {
57345777
/* check if this is a compressed file */
5735-
FILE *fp = fopen(efn, "rb");
5736-
char buf[7];
5737-
int ztype = -1, subtype = 0, compress = 0;
5738-
if (fp) {
5739-
memset(buf, 0, 7);
5740-
size_t res = fread(buf, 5, 1, fp);
5741-
fclose(fp);
5742-
if(res == 1) {
5743-
if(buf[0] == '\x1f' && buf[1] == '\x8b') ztype = 0;
5744-
if(!strncmp(buf, "BZh", 3)) ztype = 1;
5745-
if((buf[0] == '\xFD') && !strncmp(buf+1, "7zXZ", 4))
5746-
ztype = 2;
5747-
if((buf[0] == '\xFF') && !strncmp(buf+1, "LZMA", 4))
5748-
{ ztype = 2; subtype = 1;}
5749-
if(!memcmp(buf, "]\0\0\200\0", 5))
5750-
{ ztype = 2; subtype = 1;}
5751-
}
5752-
}
5753-
switch(ztype) {
5754-
case -1:
5778+
int subtype = 0, compress = 0;
5779+
comp_type ct = comp_type_from_file(efn, FALSE, &subtype);
5780+
switch(ct) {
5781+
case COMP_UNKNOWN:
57555782
con = newfile(url, ienc, strlen(open) ? open : "r", raw);
57565783
break;
5757-
case 0:
5784+
case COMP_GZ:
57585785
con = newgzfile(url, strlen(open) ? open : "rt", compress);
57595786
break;
5760-
case 1:
5787+
case COMP_BZ:
57615788
con = newbzfile(url, strlen(open) ? open : "rt", compress);
57625789
break;
5763-
case 2:
5764-
con = newxzfile(url, strlen(open) ? open : "rt", subtype, compress);
5790+
case COMP_XZ:
5791+
con = newxzfile(url, strlen(open) ? open : "rt", subtype,
5792+
compress);
57655793
break;
57665794
}
57675795
} else
@@ -6778,19 +6806,16 @@ do_memDecompress(SEXP call, SEXP op, SEXP args, SEXP env)
67786806
type = asInteger(CADR(args));
67796807
if (type == 5) {/* type = 5 is "unknown" */
67806808
char *p = (char *) RAW(from);
6781-
/* zlib commression starts with 2 bytes, which for default settings are
6782-
\x78\x9c. We could use that */
6783-
if (strncmp(p, "BZh", 3) == 0) type = 3; /* bzip2 always uses a header */
6784-
else if(p[0] == '\x1f' && p[1] == '\x8b') type = 2; /* gzip files */
6785-
else if(p[0] == '\x78' && p[1] == '\x9c') type = 2; /* gzip files */
6786-
else if((p[0] == '\xFD') && !strncmp(p+1, "7zXZ", 4)) type = 4;
6787-
else if((p[0] == '\xFF') && !strncmp(p+1, "LZMA", 4)) {
6788-
type = 4; subtype = 1;
6789-
} else if(!memcmp(p, "]\0\0\200\0", 5)) {
6790-
type = 4; subtype = 1;
6791-
} else {
6809+
comp_type ct;
6810+
ct = comp_type_from_memory(p, LENGTH(from), TRUE, &subtype);
6811+
switch(ct) {
6812+
case COMP_GZ: type = 2; break;
6813+
case COMP_BZ: type = 3; break;
6814+
case COMP_XZ: type = 4; break;
6815+
case COMP_UNKNOWN:
67926816
warning(_("unknown compression, assuming none"));
67936817
type = 1;
6818+
break;
67946819
}
67956820
}
67966821

0 commit comments

Comments
 (0)