60
60
import com .oracle .graal .python .runtime .sequence .storage .SequenceStorage ;
61
61
import com .oracle .truffle .api .CompilerDirectives ;
62
62
import com .oracle .truffle .api .CompilerDirectives .TruffleBoundary ;
63
+ import com .oracle .truffle .api .TruffleFile ;
63
64
import com .oracle .truffle .api .dsl .Cached ;
64
65
import com .oracle .truffle .api .dsl .Fallback ;
65
66
import com .oracle .truffle .api .dsl .GenerateNodeFactory ;
70
71
import java .io .File ;
71
72
import java .io .IOException ;
72
73
import java .io .InputStream ;
73
- import java .util .Enumeration ;
74
+ import java .nio .file .StandardOpenOption ;
75
+ import java .util .ArrayList ;
74
76
import java .util .List ;
75
77
import java .util .zip .ZipEntry ;
76
- import java .util .zip .ZipFile ;
78
+ import java .util .zip .ZipInputStream ;
77
79
78
80
@ CoreFunctions (extendClasses = PythonBuiltinClassType .PZipImporter )
79
81
public class ZipImporterBuiltins extends PythonBuiltins {
80
82
81
83
private static final String INIT_WAS_NOT_CALLED = "zipimporter.__init__() wasn't called" ;
82
84
85
+ /**
86
+ * This stream is need to find locations of zip entries in the zip file. The main purpose of
87
+ * this is to find location of the first local file header in the zipfile, which doesn't have to
88
+ * be as ZipInputStream expects. Some of zip files (like .egg files) don't start with location
89
+ * signature `PK\003\004` but with a code, that should be executed.
90
+ *
91
+ * In such case ZipInptuStream doesn't work, it just expects that the stream starts with the
92
+ * location signature.
93
+ *
94
+ * This stream also improve performance of unzipping files in ZipImporter case. A content of
95
+ * file is obtained from the zip, when it's needed (imported). The locations of zip entry
96
+ * positions are cached in the zip directory cache. When content of a file is needed, then
97
+ * previous zip entries are skipped and ZipInputStream is created from the required position.
98
+ *
99
+ * New ZipInputStream from this stream can be created after calling findFirstEntryPostion.
100
+ *
101
+ * It locates all occurrences of LOC signatures, even if a signature is a part of a content of a
102
+ * file. This situation has to be handled separately.
103
+ */
104
+ private static class LOCZipEntryStream extends InputStream {
105
+ // states of the simple lexer
106
+ private static final byte AFTER_P = 1 ;
107
+ private static final byte AFTER_PK = 2 ;
108
+ private static final byte AFTER_PK3 = 3 ;
109
+ private static final byte BEFORE_P = 0 ;
110
+
111
+ private byte state = BEFORE_P ; // the default state
112
+ private static final byte [] LOC_SIG = new byte []{80 , 75 , 3 , 4 }; // zip location signature
113
+
114
+ private final InputStream in ;
115
+ long pos = 0 ; // position in the input stream
116
+ private boolean readFirstLoc ; // is the first location detected?
117
+ List <Long > positions ; // store the locations
118
+
119
+ public LOCZipEntryStream (InputStream in ) {
120
+ this .readFirstLoc = false ;
121
+ this .positions = new ArrayList <>();
122
+ this .in = in ;
123
+ }
124
+
125
+ @ Override
126
+ public int read () throws IOException {
127
+ if (readFirstLoc ) {
128
+ // This expect that the bytes of the first LOC was consumed by this stream
129
+ // (due to calling findFirstEntryPosition) and now the stream
130
+ // has to push back the LOC bytes
131
+ int index = (int ) (pos - positions .get (0 ));
132
+ if (index < LOC_SIG .length ) {
133
+ pos ++;
134
+ return LOC_SIG [index ];
135
+ }
136
+ readFirstLoc = false ; // never do it again
137
+ }
138
+ int ch = in .read ();
139
+ pos ++;
140
+ switch (state ) {
141
+ case BEFORE_P :
142
+ if (ch == LOC_SIG [0 ]) {
143
+ state = AFTER_P ;
144
+ }
145
+ break ;
146
+ case AFTER_P :
147
+ if (ch == LOC_SIG [1 ]) {
148
+ state = AFTER_PK ;
149
+ } else {
150
+ state = BEFORE_P ;
151
+ }
152
+ break ;
153
+ case AFTER_PK :
154
+ if (ch == LOC_SIG [2 ]) {
155
+ state = AFTER_PK3 ;
156
+ } else {
157
+ state = BEFORE_P ;
158
+ }
159
+ break ;
160
+ case AFTER_PK3 :
161
+ if (ch == LOC_SIG [3 ]) {
162
+ positions .add (pos - 4 ); // store the LOC position
163
+ }
164
+ state = BEFORE_P ;
165
+ }
166
+ return ch ;
167
+ }
168
+
169
+ void findFirstEntryPosition () throws IOException {
170
+ while (positions .isEmpty () && read () != -1 ) {
171
+ // do nothing here, just read until the first LOC is found
172
+ }
173
+ if (!positions .isEmpty ()) {
174
+ pos -= 4 ;
175
+ readFirstLoc = true ;
176
+ }
177
+ }
178
+ }
179
+
83
180
@ Override
84
181
protected List <? extends NodeFactory <? extends PythonBuiltinBaseNode >> getNodeFactories () {
85
182
return ZipImporterBuiltinsFactory .getFactories ();
@@ -99,65 +196,107 @@ private void initZipImporter(PZipImporter self, String path) {
99
196
throw raise (PythonErrorType .ZipImportError , "archive path is empty" );
100
197
}
101
198
102
- File file = new File (path );
199
+ TruffleFile tfile = getContext (). getEnv (). getTruffleFile (path );
103
200
String prefix = "" ;
104
201
String archive = "" ;
105
202
while (true ) {
106
- File fullPathFile = new File (file .getPath ());
107
- try {
108
- if (fullPathFile .isFile ()) {
109
- archive = file .getPath ();
110
- break ;
111
- }
112
- } catch (SecurityException se ) {
113
- // continue
203
+ if (tfile .isRegularFile ()) {
204
+ // we don't have to store absolute path
205
+ archive = tfile .getPath ();
206
+ break ;
114
207
}
115
-
116
- // back up one path element
117
- File parentFile = file .getParentFile ();
208
+ TruffleFile parentFile = tfile .getParent ();
118
209
if (parentFile == null ) {
119
210
break ;
120
211
}
121
-
122
- prefix = file .getName () + PZipImporter .SEPARATOR + prefix ;
123
- file = parentFile ;
212
+ prefix = tfile .getName () + PZipImporter .SEPARATOR + prefix ;
213
+ tfile = parentFile ;
124
214
}
125
- ZipFile zipFile = null ;
126
215
127
- if (file .exists () && file .isFile ()) {
128
- try {
129
- zipFile = new ZipFile (file );
130
- } catch (IOException e ) {
131
- throw raise (PythonErrorType .ZipImportError , "not a Zip file" );
132
- }
133
- }
134
- if (zipFile == null ) {
135
- throw raise (PythonErrorType .ZipImportError , "not a Zip file" );
136
- }
137
- Object files = self .getZipDirectoryCache ().getItem (path );
138
- if (files == null ) {
139
- PDict filesDict = factory ().createDict ();
140
- Enumeration <? extends ZipEntry > entries = zipFile .entries ();
141
- while (entries .hasMoreElements ()) {
142
- ZipEntry entry = entries .nextElement ();
143
- PTuple tuple = factory ().createTuple (new Object []{
144
- zipFile .getName () + PZipImporter .SEPARATOR + entry .getName (),
145
- // for our implementation currently we don't need these
146
- // these properties to store there. Keeping them for
147
- // compatibility.
148
- entry .getMethod (),
149
- entry .getCompressedSize (),
150
- entry .getSize (),
151
- entry .getLastModifiedTime ().toMillis (),
152
- entry .getCrc ()});
153
- filesDict .setItem (entry .getName (), tuple );
216
+ if (tfile .exists () && tfile .isRegularFile ()) {
217
+ Object files = self .getZipDirectoryCache ().getItem (path );
218
+ if (files == null ) {
219
+ // fill the cache
220
+ PDict filesDict = factory ().createDict ();
221
+ ZipInputStream zis = null ;
222
+ LOCZipEntryStream locis = null ;
223
+ try {
224
+ locis = new LOCZipEntryStream (tfile .newInputStream (StandardOpenOption .READ ));
225
+ locis .findFirstEntryPosition (); // find location of the first zip entry
226
+ if (locis .positions .isEmpty ()) {
227
+ // no PK\003\004 found -> not a correct zip file
228
+ throw raise (PythonErrorType .ZipImportError , "not a Zip file: '%s'" , archive );
229
+ }
230
+ zis = new ZipInputStream (locis ); // and create new ZipInput stream from this
231
+ // location
232
+ ZipEntry entry ;
233
+
234
+ // help variable to handle case when there LOC is in content of a file
235
+ long lastZipEntryCSize = 0 ;
236
+ long lastZipEntryPos = 0 ;
237
+ int lastZipLocFileHeaderSize = 0 ;
238
+ long zipEntryPos ;
239
+
240
+ byte [] extraField ;
241
+ while ((entry = zis .getNextEntry ()) != null ) {
242
+ zipEntryPos = locis .positions .remove (0 );
243
+ // handles situation when the local file signature is
244
+ // in the content of a file
245
+ while (lastZipEntryPos + lastZipEntryCSize + lastZipLocFileHeaderSize > zipEntryPos ) {
246
+ zipEntryPos = locis .positions .remove (0 );
247
+ }
248
+
249
+ PTuple tuple = factory ().createTuple (new Object []{
250
+ tfile .getPath () + PZipImporter .SEPARATOR + entry .getName (),
251
+ // for our implementation currently we don't need these
252
+ // these properties to store there. Keeping them for
253
+ // compatibility.
254
+ entry .getMethod (),
255
+ lastZipEntryCSize = entry .getCompressedSize (),
256
+ entry .getSize (),
257
+ entry .getLastModifiedTime ().toMillis (),
258
+ entry .getCrc (),
259
+ // store the entry position for faster reading content
260
+ lastZipEntryPos = zipEntryPos
261
+ });
262
+ filesDict .setItem (entry .getName (), tuple );
263
+ // count local file header from the last zipentry
264
+ lastZipLocFileHeaderSize = 30 + entry .getName ().length ();
265
+ extraField = entry .getExtra ();
266
+ if (extraField != null ) {
267
+ lastZipLocFileHeaderSize += extraField .length ;
268
+ }
269
+ }
270
+ } catch (IOException ex ) {
271
+ throw raise (PythonErrorType .ZipImportError , "not a Zip file: '%s'" , archive );
272
+ } finally {
273
+ if (zis != null ) {
274
+ try {
275
+ zis .close ();
276
+ } catch (IOException e ) {
277
+ // just ignore it.
278
+ }
279
+ } else {
280
+ if (locis != null ) {
281
+ try {
282
+ locis .close ();
283
+ } catch (IOException e ) {
284
+ // just ignore it.
285
+ }
286
+ }
287
+ }
288
+ }
289
+ files = filesDict ;
290
+ self .getZipDirectoryCache ().setItem (path , files );
154
291
}
155
- files = filesDict ;
156
- self .getZipDirectoryCache ().setItem (path , files );
292
+ self .setArchive (archive );
293
+ self .setPrefix (prefix );
294
+ self .setFiles ((PDict ) files );
295
+
296
+ } else {
297
+ throw raise (PythonErrorType .ZipImportError , "not a Zip file: '%s'" , archive );
157
298
}
158
- self .setArchive (archive );
159
- self .setPrefix (prefix );
160
- self .setFiles ((PDict ) files );
299
+
161
300
}
162
301
163
302
@ Specialization
@@ -324,28 +463,35 @@ public PBytes doit(PZipImporter self, String pathname) {
324
463
if (fileSize < 0 ) {
325
464
throw raise (PythonErrorType .ZipImportError , "negative data size" );
326
465
}
327
- ZipFile zip = null ;
466
+ long streamPosition = (long ) tocEntry .getArray ()[6 ];
467
+ ZipInputStream zis = null ;
468
+ TruffleFile tfile = getContext ().getEnv ().getTruffleFile (archive );
328
469
try {
329
- zip = new ZipFile (archive );
330
- ZipEntry entry = zip .getEntry (key );
331
- InputStream in = zip .getInputStream (entry );
470
+ InputStream in = tfile .newInputStream (StandardOpenOption .READ );
471
+ in .skip (streamPosition ); // we can fast skip bytes, because there is cached position
472
+ // of the zip entry
473
+ zis = new ZipInputStream (in );
474
+ ZipEntry entry = zis .getNextEntry ();
475
+ if (entry == null || !entry .getName ().equals (key )) {
476
+ throw raise (PythonErrorType .ZipImportError , "zipimport: wrong cached file position" );
477
+ }
332
478
int byteSize = (int ) fileSize ;
333
479
if (byteSize != fileSize ) {
334
480
throw raise (PythonErrorType .ZipImportError , "zipimport: cannot read archive members large than 2GB" );
335
481
}
336
482
byte [] bytes = new byte [byteSize ];
337
483
int bytesRead = 0 ;
338
484
while (bytesRead < byteSize ) {
339
- bytesRead += in .read (bytes , bytesRead , byteSize - bytesRead );
485
+ bytesRead += zis .read (bytes , bytesRead , byteSize - bytesRead );
340
486
}
341
- in .close ();
487
+ zis .close ();
342
488
return factory ().createBytes (bytes );
343
489
} catch (IOException e ) {
344
490
throw raise (PythonErrorType .ZipImportError , "zipimport: can't read data" );
345
491
} finally {
346
- if (zip != null ) {
492
+ if (zis != null ) {
347
493
try {
348
- zip .close ();
494
+ zis .close ();
349
495
} catch (IOException e ) {
350
496
// just ignore it.
351
497
}
0 commit comments