72
72
import java .io .IOException ;
73
73
import java .io .InputStream ;
74
74
import java .nio .file .StandardOpenOption ;
75
+ import java .util .ArrayList ;
75
76
import java .util .List ;
76
77
import java .util .zip .ZipEntry ;
77
78
import java .util .zip .ZipInputStream ;
@@ -81,6 +82,98 @@ public class ZipImporterBuiltins extends PythonBuiltins {
81
82
82
83
private static final String INIT_WAS_NOT_CALLED = "zipimporter.__init__() wasn't called" ;
83
84
85
+ /**
86
+ * This stream is need to find locations of zip entries in the zip file. The main purpose of
87
+ * this is to find location of the first local file header in the zipfile, which doesn't have to
88
+ * be as ZipInputStream expects. Some of zip files (like .egg files) don't start with location
89
+ * signature `PK\003\004` but with a code, that should be executed.
90
+ *
91
+ * In such case ZipInptuStream doesn't work, it just expects that the stream starts with the
92
+ * location signature.
93
+ *
94
+ * This stream also improve performance of unzipping files in ZipImporter case. A content of
95
+ * file is obtained from the zip, when it's needed (imported). The locations of zip entry
96
+ * positions are cached in the zip directory cache. When content of a file is needed, then
97
+ * previous zip entries are skipped and ZipInputStream is created from the required position.
98
+ *
99
+ * New ZipInputStream from this stream can be created after calling findFirstEntryPostion.
100
+ *
101
+ * It locates all occurrences of LOC signatures, even if a signature is a part of a content of a
102
+ * file. This situation has to be handled separately.
103
+ */
104
+ private static class LOCZipEntryStream extends InputStream {
105
+ // states of the simple lexer
106
+ private static final byte AFTER_P = 1 ;
107
+ private static final byte AFTER_PK = 2 ;
108
+ private static final byte AFTER_PK3 = 3 ;
109
+ private static final byte BEFORE_P = 0 ;
110
+
111
+ private byte state = BEFORE_P ; // the default state
112
+ private static final byte [] LOC_SIG = new byte []{80 , 75 , 3 , 4 }; // zip location signature
113
+
114
+ private final InputStream in ;
115
+ long pos = 0 ; // position in the input stream
116
+ private boolean readFirstLoc ; // is the first location detected?
117
+ List <Long > positions ; // store the locations
118
+
119
+ public LOCZipEntryStream (InputStream in ) {
120
+ this .readFirstLoc = false ;
121
+ this .positions = new ArrayList <>();
122
+ this .in = in ;
123
+ }
124
+
125
+ @ Override
126
+ public int read () throws IOException {
127
+ if (readFirstLoc ) {
128
+ // This expect that the bytes of the first LOC was consumed by this stream
129
+ // (due to calling findFirstEntryPosition) and now the stream
130
+ // has to push back the LOC bytes
131
+ int index = (int ) (pos - positions .get (0 ));
132
+ if (index < LOC_SIG .length ) {
133
+ pos ++;
134
+ return LOC_SIG [index ];
135
+ }
136
+ readFirstLoc = false ; // never do it again
137
+ }
138
+ int ch = in .read ();
139
+ pos ++;
140
+ switch (state ) {
141
+ case BEFORE_P :
142
+ if (ch == LOC_SIG [0 ]) {
143
+ state = AFTER_P ;
144
+ }
145
+ break ;
146
+ case AFTER_P :
147
+ if (ch == LOC_SIG [1 ]) {
148
+ state = AFTER_PK ;
149
+ } else {
150
+ state = BEFORE_P ;
151
+ }
152
+ break ;
153
+ case AFTER_PK :
154
+ if (ch == LOC_SIG [2 ]) {
155
+ state = AFTER_PK3 ;
156
+ } else {
157
+ state = BEFORE_P ;
158
+ }
159
+ break ;
160
+ case AFTER_PK3 :
161
+ if (ch == LOC_SIG [3 ]) {
162
+ positions .add (pos - 4 ); // store the LOC position
163
+ }
164
+ state = BEFORE_P ;
165
+ }
166
+ return ch ;
167
+ }
168
+
169
+ void findFirstEntryPosition () throws IOException {
170
+ while (positions .isEmpty () && read () != -1 ) {
171
+ }
172
+ pos -= 4 ;
173
+ readFirstLoc = true ;
174
+ }
175
+ }
176
+
84
177
@ Override
85
178
protected List <? extends NodeFactory <? extends PythonBuiltinBaseNode >> getNodeFactories () {
86
179
return ZipImporterBuiltinsFactory .getFactories ();
@@ -116,46 +209,83 @@ private void initZipImporter(PZipImporter self, String path) {
116
209
prefix = tfile .getName () + PZipImporter .SEPARATOR + prefix ;
117
210
tfile = parentFile ;
118
211
}
119
- ZipInputStream zis = null ;
212
+
120
213
if (tfile .exists () && tfile .isRegularFile ()) {
121
- try {
122
- zis = new ZipInputStream (tfile .newInputStream (StandardOpenOption .READ ));
123
- Object files = self .getZipDirectoryCache ().getItem (path );
124
- if (files == null ) {
125
- // fill the cache
126
- PDict filesDict = factory ().createDict ();
214
+ Object files = self .getZipDirectoryCache ().getItem (path );
215
+ if (files == null ) {
216
+ // fill the cache
217
+ PDict filesDict = factory ().createDict ();
218
+ ZipInputStream zis = null ;
219
+ LOCZipEntryStream locis = null ;
220
+ try {
221
+ locis = new LOCZipEntryStream (tfile .newInputStream (StandardOpenOption .READ ));
222
+ locis .findFirstEntryPosition (); // find location of the first zip entry
223
+ zis = new ZipInputStream (locis ); // and create new ZipInput stream from this
224
+ // location
127
225
ZipEntry entry ;
128
226
227
+ // help variable to handle case when there LOC is in content of a file
228
+ long lastZipEntryCSize = 0 ;
229
+ long lastZipEntryPos = 0 ;
230
+ int lastZipLocFileHeaderSize = 0 ;
231
+ long zipEntryPos ;
232
+
233
+ byte [] extraField ;
129
234
while ((entry = zis .getNextEntry ()) != null ) {
235
+ zipEntryPos = locis .positions .remove (0 );
236
+ // handles situation when the local file signature is
237
+ // in the content of a file
238
+ while (lastZipEntryPos + lastZipEntryCSize + lastZipLocFileHeaderSize > zipEntryPos ) {
239
+ zipEntryPos = locis .positions .remove (0 );
240
+ }
241
+
130
242
PTuple tuple = factory ().createTuple (new Object []{
131
243
tfile .getPath () + PZipImporter .SEPARATOR + entry .getName (),
132
244
// for our implementation currently we don't need these
133
245
// these properties to store there. Keeping them for
134
246
// compatibility.
135
247
entry .getMethod (),
136
- entry .getCompressedSize (),
248
+ lastZipEntryCSize = entry .getCompressedSize (),
137
249
entry .getSize (),
138
250
entry .getLastModifiedTime ().toMillis (),
139
- entry .getCrc ()});
251
+ entry .getCrc (),
252
+ // store the entry position for faster reading content
253
+ lastZipEntryPos = zipEntryPos
254
+ });
140
255
filesDict .setItem (entry .getName (), tuple );
256
+ // count local file header from the last zipentry
257
+ lastZipLocFileHeaderSize = 30 + entry .getName ().length ();
258
+ extraField = entry .getExtra ();
259
+ if (extraField != null ) {
260
+ lastZipLocFileHeaderSize += extraField .length ;
261
+ }
141
262
}
142
- files = filesDict ;
143
- self .getZipDirectoryCache ().setItem (path , files );
144
- }
145
- self .setArchive (archive );
146
- self .setPrefix (prefix );
147
- self .setFiles ((PDict ) files );
148
- } catch (IOException ex ) {
149
- throw raise (PythonErrorType .ZipImportError , "not a Zip file" );
150
- } finally {
151
- if (zis != null ) {
152
- try {
153
- zis .close ();
154
- } catch (IOException e ) {
155
- // just ignore it.
263
+ } catch (IOException ex ) {
264
+ throw raise (PythonErrorType .ZipImportError , "not a Zip file" );
265
+ } finally {
266
+ if (zis != null ) {
267
+ try {
268
+ zis .close ();
269
+ } catch (IOException e ) {
270
+ // just ignore it.
271
+ }
272
+ } else {
273
+ if (locis != null ) {
274
+ try {
275
+ locis .close ();
276
+ } catch (IOException e ) {
277
+ // just ignore it.
278
+ }
279
+ }
156
280
}
157
281
}
282
+ files = filesDict ;
283
+ self .getZipDirectoryCache ().setItem (path , files );
158
284
}
285
+ self .setArchive (archive );
286
+ self .setPrefix (prefix );
287
+ self .setFiles ((PDict ) files );
288
+
159
289
} else {
160
290
throw raise (PythonErrorType .ZipImportError , "not a Zip file" );
161
291
}
@@ -302,16 +432,6 @@ public static GetCodeNode create() {
302
432
@ GenerateNodeFactory
303
433
public abstract static class GetDataNode extends PythonBinaryBuiltinNode {
304
434
305
- private static ZipInputStream getEntryIS (InputStream fileStream , String entryName ) throws IOException {
306
- ZipInputStream zis = new ZipInputStream (fileStream );
307
- for (ZipEntry entry ; (entry = zis .getNextEntry ()) != null ;) {
308
- if (entry .getName ().equals (entryName )) {
309
- return zis ;
310
- }
311
- }
312
- throw new IOException ("Cannot find " + entryName );
313
- }
314
-
315
435
@ Specialization
316
436
@ CompilerDirectives .TruffleBoundary
317
437
public PBytes doit (PZipImporter self , String pathname ) {
@@ -336,10 +456,18 @@ public PBytes doit(PZipImporter self, String pathname) {
336
456
if (fileSize < 0 ) {
337
457
throw raise (PythonErrorType .ZipImportError , "negative data size" );
338
458
}
459
+ long streamPosition = (long ) tocEntry .getArray ()[6 ];
339
460
ZipInputStream zis = null ;
340
461
TruffleFile tfile = getContext ().getEnv ().getTruffleFile (archive );
341
462
try {
342
- zis = getEntryIS (tfile .newInputStream (StandardOpenOption .READ ), key );
463
+ InputStream in = tfile .newInputStream (StandardOpenOption .READ );
464
+ in .skip (streamPosition ); // we can fast skip bytes, because there is cached position
465
+ // of the zip entry
466
+ zis = new ZipInputStream (in );
467
+ ZipEntry entry = zis .getNextEntry ();
468
+ if (entry == null || !entry .getName ().equals (key )) {
469
+ throw raise (PythonErrorType .ZipImportError , "zipimport: wrong cached file position" );
470
+ }
343
471
int byteSize = (int ) fileSize ;
344
472
if (byteSize != fileSize ) {
345
473
throw raise (PythonErrorType .ZipImportError , "zipimport: cannot read archive members large than 2GB" );
0 commit comments