@@ -2387,6 +2387,58 @@ newxzfile(const char *description, const char *mode, int type, int compress)
23872387 return new ;
23882388}
23892389
/* Compression formats that can be recognised from leading magic bytes. */
typedef enum {
    COMP_UNKNOWN = 0,	/* no recognised signature */
    COMP_GZ,		/* gzip (or a zlib stream, when the caller asks for it) */
    COMP_BZ,		/* bzip2 */
    COMP_XZ		/* xz or lzma; the two are told apart by a subtype */
} comp_type;
2391+
2392+ static comp_type
2393+ comp_type_from_memory (char * buf , size_t len , Rboolean with_zlib , int * subtype )
2394+ {
2395+ if (len >= 2 && buf [0 ] == '\x1f' && buf [1 ] == '\x8b' )
2396+ return COMP_GZ ;
2397+ else if (with_zlib && len >=2 && buf [0 ] == '\x78' && buf [1 ] == '\x9c' )
2398+ /* zlib commression starts with 2 bytes, which for default settings are
2399+ \x78\x9c. We could use that */
2400+ return COMP_GZ ;
2401+ else if (len >= 10 && !strncmp (buf , "BZh" , 3 )) {
2402+ /* check also the block size and the block/eos magic to reduce
2403+ the risk of picking up an uncompressed file (PR#18768) */
2404+ if (buf [3 ] >= '1' && buf [3 ] <= '9' ) {
2405+ // 0x314159265359 (BCD (pi))
2406+ // 0x177245385090 (BCD sqrt(pi))
2407+ if (!memcmp (buf + 4 , "\x31\x41\x59\x26\x53\x59" , 6 ) ||
2408+ !memcmp (buf + 4 , "\x17\x72\x45\x38\x50\x90" , 6 ))
2409+
2410+ return COMP_BZ ;
2411+ }
2412+ } else if (len >= 5 && buf [0 ] == '\xFD' && !strncmp (buf + 1 , "7zXZ" , 4 )) {
2413+ * subtype = 0 ;
2414+ return COMP_XZ ;
2415+ } else if (len >= 5 && buf [0 ] == '\xFF' && !strncmp (buf + 1 , "LZMA" , 4 )) {
2416+ * subtype = 1 ;
2417+ return COMP_XZ ;
2418+ } else if (len >= 5 && !memcmp (buf , "]\0\0\200\0" , 5 )) {
2419+ * subtype = 1 ;
2420+ return COMP_XZ ;
2421+ } else if (len >= 4 && buf [0 ] == '\x89' && !strncmp (buf + 1 , "LZO" , 3 ))
2422+ error (_ ("this is a %s-compressed file which this build of R does not support" ),
2423+ "lzop" );
2424+ return COMP_UNKNOWN ;
2425+ }
2426+
2427+ static comp_type
2428+ comp_type_from_file (const char * name , Rboolean with_zlib , int * subtype )
2429+ {
2430+ FILE * fp = fopen (name , "rb" );
2431+ char buf [10 ];
2432+
2433+ if (fp ) {
2434+ size_t res = fread (buf , 1 , sizeof (buf ), fp );
2435+ fclose (fp );
2436+ if (res > 0 )
2437+ return comp_type_from_memory (buf , res , with_zlib , subtype );
2438+ }
2439+ return COMP_UNKNOWN ;
2440+ }
2441+
23902442/* op 0 is gzfile, 1 is bzfile, 2 is xz/lzma */
23912443attribute_hidden SEXP do_gzfile (SEXP call , SEXP op , SEXP args , SEXP env )
23922444{
@@ -2425,27 +2477,18 @@ attribute_hidden SEXP do_gzfile(SEXP call, SEXP op, SEXP args, SEXP env)
24252477 open = CHAR (STRING_ELT (sopen , 0 )); /* ASCII */
24262478 if (type == 0 && (!open [0 ] || open [0 ] == 'r' )) {
24272479 /* check magic no */
2428- FILE * fp = fopen (R_ExpandFileName (file ), "rb" );
2429- char buf [7 ];
2430- if (fp ) {
2431- size_t res ;
2432- memset (buf , 0 , 7 ); res = fread (buf , 5 , 1 , fp ); fclose (fp );
2433- if (res == 1 ) {
2434- if (!strncmp (buf , "BZh" , 3 )) type = 1 ;
2435- if ((buf [0 ] == '\xFD' ) && !strncmp (buf + 1 , "7zXZ" , 4 )) type = 2 ;
2436- if ((buf [0 ] == '\xFF' ) && !strncmp (buf + 1 , "LZMA" , 4 )) {
2437- type = 2 ; subtype = 1 ;
2438- }
2439- if (!memcmp (buf , "]\0\0\200\0" , 5 )) {
2440- type = 2 ; subtype = 1 ;
2441- }
2442- if ((buf [0 ] == '\x89' ) && !strncmp (buf + 1 , "LZO" , 3 ))
2443- error (_ ("this is a %s-compressed file which this build of R does not support" ), "lzop" );
2444- }
2480+ comp_type ct ;
2481+ ct = comp_type_from_file (R_ExpandFileName (file ), FALSE, & subtype );
2482+ switch (ct ) {
2483+ case COMP_GZ :
2484+ case COMP_UNKNOWN : type = 0 ; break ;
2485+ case COMP_BZ : type = 1 ; break ;
2486+ case COMP_XZ : type = 2 ; break ;
24452487 }
24462488 }
24472489 switch (type ) {
24482490 case 0 :
2491+ /* gzfile connection handles also transparent (uncompressed) files */
24492492 con = newgzfile (file , strlen (open ) ? open : "rb" , compress );
24502493 break ;
24512494 case 1 :
@@ -5732,36 +5775,21 @@ attribute_hidden SEXP do_url(SEXP call, SEXP op, SEXP args, SEXP env)
57325775 if (!raw &&
57335776 (!strlen (open ) || streql (open , "r" ) || streql (open , "rt" ))) {
57345777 /* check if this is a compressed file */
5735- FILE * fp = fopen (efn , "rb" );
5736- char buf [7 ];
5737- int ztype = -1 , subtype = 0 , compress = 0 ;
5738- if (fp ) {
5739- memset (buf , 0 , 7 );
5740- size_t res = fread (buf , 5 , 1 , fp );
5741- fclose (fp );
5742- if (res == 1 ) {
5743- if (buf [0 ] == '\x1f' && buf [1 ] == '\x8b' ) ztype = 0 ;
5744- if (!strncmp (buf , "BZh" , 3 )) ztype = 1 ;
5745- if ((buf [0 ] == '\xFD' ) && !strncmp (buf + 1 , "7zXZ" , 4 ))
5746- ztype = 2 ;
5747- if ((buf [0 ] == '\xFF' ) && !strncmp (buf + 1 , "LZMA" , 4 ))
5748- { ztype = 2 ; subtype = 1 ;}
5749- if (!memcmp (buf , "]\0\0\200\0" , 5 ))
5750- { ztype = 2 ; subtype = 1 ;}
5751- }
5752- }
5753- switch (ztype ) {
5754- case -1 :
5778+ int subtype = 0 , compress = 0 ;
5779+ comp_type ct = comp_type_from_file (efn , FALSE, & subtype );
5780+ switch (ct ) {
5781+ case COMP_UNKNOWN :
57555782 con = newfile (url , ienc , strlen (open ) ? open : "r" , raw );
57565783 break ;
5757- case 0 :
5784+ case COMP_GZ :
57585785 con = newgzfile (url , strlen (open ) ? open : "rt" , compress );
57595786 break ;
5760- case 1 :
5787+ case COMP_BZ :
57615788 con = newbzfile (url , strlen (open ) ? open : "rt" , compress );
57625789 break ;
5763- case 2 :
5764- con = newxzfile (url , strlen (open ) ? open : "rt" , subtype , compress );
5790+ case COMP_XZ :
5791+ con = newxzfile (url , strlen (open ) ? open : "rt" , subtype ,
5792+ compress );
57655793 break ;
57665794 }
57675795 } else
@@ -6778,19 +6806,16 @@ do_memDecompress(SEXP call, SEXP op, SEXP args, SEXP env)
67786806 type = asInteger (CADR (args ));
67796807 if (type == 5 ) {/* type = 5 is "unknown" */
67806808 char * p = (char * ) RAW (from );
6781- /* zlib commression starts with 2 bytes, which for default settings are
6782- \x78\x9c. We could use that */
6783- if (strncmp (p , "BZh" , 3 ) == 0 ) type = 3 ; /* bzip2 always uses a header */
6784- else if (p [0 ] == '\x1f' && p [1 ] == '\x8b' ) type = 2 ; /* gzip files */
6785- else if (p [0 ] == '\x78' && p [1 ] == '\x9c' ) type = 2 ; /* gzip files */
6786- else if ((p [0 ] == '\xFD' ) && !strncmp (p + 1 , "7zXZ" , 4 )) type = 4 ;
6787- else if ((p [0 ] == '\xFF' ) && !strncmp (p + 1 , "LZMA" , 4 )) {
6788- type = 4 ; subtype = 1 ;
6789- } else if (!memcmp (p , "]\0\0\200\0" , 5 )) {
6790- type = 4 ; subtype = 1 ;
6791- } else {
6809+ comp_type ct ;
6810+ ct = comp_type_from_memory (p , LENGTH (from ), TRUE, & subtype );
6811+ switch (ct ) {
6812+ case COMP_GZ : type = 2 ; break ;
6813+ case COMP_BZ : type = 3 ; break ;
6814+ case COMP_XZ : type = 4 ; break ;
6815+ case COMP_UNKNOWN :
67926816 warning (_ ("unknown compression, assuming none" ));
67936817 type = 1 ;
6818+ break ;
67946819 }
67956820 }
67966821
0 commit comments