Skip to content

Commit cdae4a5

Browse files
committed
Solving problems with ZipInputStream.
1 parent 3fccb7a commit cdae4a5

File tree

3 files changed

+188
-34
lines changed

3 files changed

+188
-34
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_zipimport.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,12 @@ def get_file():
2424
"""
2525

2626
ZIP_FILE_NAME = 'testzipfile.zip'
27+
EGG_FILE_NAME = 'testeggfile.egg'
2728
DIR_PATH = os.path.dirname(os.path.realpath(__file__))
2829
ZIP_PATH = os.path.join(DIR_PATH, ZIP_FILE_NAME)
2930
ZIP_ABS_PATH = os.path.abspath(ZIP_PATH);
31+
EGG_PATH = os.path.join(DIR_PATH, EGG_FILE_NAME)
32+
EGG_ABS_PATH = os.path.abspath(EGG_PATH);
3033

3134
class ZipImportBaseTestCase(unittest.TestCase):
3235

@@ -176,3 +179,26 @@ def test_module_import(self):
176179
self.assertTrue (m.get_file() == ZIP_ABS_PATH + "/MyTestModule.py")
177180
p = importlib.import_module("packageA.moduleC")
178181
self.assertTrue (p.get_file() == ZIP_ABS_PATH + "/packageA/moduleC.py")
182+
183+
class BasicEggImportTests(ZipImportBaseTestCase):
    """Exercise zipimporter against the .egg test archive at EGG_PATH.

    The archive is expected to expose the members ``data.bin`` and ``read.me``.
    The expected payload of ``data.bin`` embeds the bytes ``PK\\003\\004``, i.e. a
    zip local-file-header signature inside file content.
    """

    def setUp(self):
        # Run the shared fixture first, then open the egg with a fresh importer.
        ZipImportBaseTestCase.setUp(self)
        self.z = zipimport.zipimporter(EGG_PATH)

    def test_zipimporter_egg(self):
        """The importer reports an empty prefix, the absolute archive path and both members."""
        # Specific assertions are used so a failure shows the offending values.
        self.assertEqual(self.z.prefix, "")
        self.assertEqual(self.z.archive, EGG_ABS_PATH)
        self.assertIs(type(self.z._files), dict)
        self.assertIsNotNone(self.z._files["data.bin"])
        self.assertIsNotNone(self.z._files["read.me"])

    def test_egg_get_data(self):
        """get_data returns the exact bytes of a member whose content contains a LOC signature."""
        data = self.z.get_data("data.bin")
        self.assertIs(type(data), bytes)
        self.assertEqual(b'ahojPK\003\004ahoj', data)

    def test_egg_get_readme(self):
        """get_data returns the exact bytes of the plain text member."""
        data = self.z.get_data("read.me")
        self.assertIs(type(data), bytes)
        self.assertEqual(b'Pokus\n', data)
Binary file not shown.

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/zipimporter/ZipImporterBuiltins.java

Lines changed: 162 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
import java.io.IOException;
7373
import java.io.InputStream;
7474
import java.nio.file.StandardOpenOption;
75+
import java.util.ArrayList;
7576
import java.util.List;
7677
import java.util.zip.ZipEntry;
7778
import java.util.zip.ZipInputStream;
@@ -81,6 +82,98 @@ public class ZipImporterBuiltins extends PythonBuiltins {
8182

8283
private static final String INIT_WAS_NOT_CALLED = "zipimporter.__init__() wasn't called";
8384

85+
/**
86+
* This stream is need to find locations of zip entries in the zip file. The main purpose of
87+
* this is to find location of the first local file header in the zipfile, which doesn't have to
88+
* be as ZipInputStream expects. Some of zip files (like .egg files) don't start with location
89+
* signature `PK\003\004` but with a code, that should be executed.
90+
*
91+
* In such case ZipInptuStream doesn't work, it just expects that the stream starts with the
92+
* location signature.
93+
*
94+
* This stream also improve performance of unzipping files in ZipImporter case. A content of
95+
* file is obtained from the zip, when it's needed (imported). The locations of zip entry
96+
* positions are cached in the zip directory cache. When content of a file is needed, then
97+
* previous zip entries are skipped and ZipInputStream is created from the required position.
98+
*
99+
* New ZipInputStream from this stream can be created after calling findFirstEntryPostion.
100+
*
101+
* It locates all occurrences of LOC signatures, even if a signature is a part of a content of a
102+
* file. This situation has to be handled separately.
103+
*/
104+
private static class LOCZipEntryStream extends InputStream {
105+
// states of the simple lexer
106+
private static final byte AFTER_P = 1;
107+
private static final byte AFTER_PK = 2;
108+
private static final byte AFTER_PK3 = 3;
109+
private static final byte BEFORE_P = 0;
110+
111+
private byte state = BEFORE_P; // the default state
112+
private static final byte[] LOC_SIG = new byte[]{80, 75, 3, 4}; // zip location signature
113+
114+
private final InputStream in;
115+
long pos = 0; // position in the input stream
116+
private boolean readFirstLoc; // is the first location detected?
117+
List<Long> positions; // store the locations
118+
119+
public LOCZipEntryStream(InputStream in) {
120+
this.readFirstLoc = false;
121+
this.positions = new ArrayList<>();
122+
this.in = in;
123+
}
124+
125+
@Override
126+
public int read() throws IOException {
127+
if (readFirstLoc) {
128+
// This expect that the bytes of the first LOC was consumed by this stream
129+
// (due to calling findFirstEntryPosition) and now the stream
130+
// has to push back the LOC bytes
131+
int index = (int) (pos - positions.get(0));
132+
if (index < LOC_SIG.length) {
133+
pos++;
134+
return LOC_SIG[index];
135+
}
136+
readFirstLoc = false; // never do it again
137+
}
138+
int ch = in.read();
139+
pos++;
140+
switch (state) {
141+
case BEFORE_P:
142+
if (ch == LOC_SIG[0]) {
143+
state = AFTER_P;
144+
}
145+
break;
146+
case AFTER_P:
147+
if (ch == LOC_SIG[1]) {
148+
state = AFTER_PK;
149+
} else {
150+
state = BEFORE_P;
151+
}
152+
break;
153+
case AFTER_PK:
154+
if (ch == LOC_SIG[2]) {
155+
state = AFTER_PK3;
156+
} else {
157+
state = BEFORE_P;
158+
}
159+
break;
160+
case AFTER_PK3:
161+
if (ch == LOC_SIG[3]) {
162+
positions.add(pos - 4); // store the LOC position
163+
}
164+
state = BEFORE_P;
165+
}
166+
return ch;
167+
}
168+
169+
void findFirstEntryPosition() throws IOException {
170+
while (positions.isEmpty() && read() != -1) {
171+
}
172+
pos -= 4;
173+
readFirstLoc = true;
174+
}
175+
}
176+
84177
@Override
85178
protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFactories() {
86179
return ZipImporterBuiltinsFactory.getFactories();
@@ -116,46 +209,83 @@ private void initZipImporter(PZipImporter self, String path) {
116209
prefix = tfile.getName() + PZipImporter.SEPARATOR + prefix;
117210
tfile = parentFile;
118211
}
119-
ZipInputStream zis = null;
212+
120213
if (tfile.exists() && tfile.isRegularFile()) {
121-
try {
122-
zis = new ZipInputStream(tfile.newInputStream(StandardOpenOption.READ));
123-
Object files = self.getZipDirectoryCache().getItem(path);
124-
if (files == null) {
125-
// fill the cache
126-
PDict filesDict = factory().createDict();
214+
Object files = self.getZipDirectoryCache().getItem(path);
215+
if (files == null) {
216+
// fill the cache
217+
PDict filesDict = factory().createDict();
218+
ZipInputStream zis = null;
219+
LOCZipEntryStream locis = null;
220+
try {
221+
locis = new LOCZipEntryStream(tfile.newInputStream(StandardOpenOption.READ));
222+
locis.findFirstEntryPosition(); // find location of the first zip entry
223+
zis = new ZipInputStream(locis); // and create new ZipInput stream from this
224+
// location
127225
ZipEntry entry;
128226

227+
// helper variables to handle the case where a LOC signature appears inside the content of a file
228+
long lastZipEntryCSize = 0;
229+
long lastZipEntryPos = 0;
230+
int lastZipLocFileHeaderSize = 0;
231+
long zipEntryPos;
232+
233+
byte[] extraField;
129234
while ((entry = zis.getNextEntry()) != null) {
235+
zipEntryPos = locis.positions.remove(0);
236+
// handles situation when the local file signature is
237+
// in the content of a file
238+
while (lastZipEntryPos + lastZipEntryCSize + lastZipLocFileHeaderSize > zipEntryPos) {
239+
zipEntryPos = locis.positions.remove(0);
240+
}
241+
130242
PTuple tuple = factory().createTuple(new Object[]{
131243
tfile.getPath() + PZipImporter.SEPARATOR + entry.getName(),
132244
// for our implementation currently we don't need these
133245
// properties to be stored here. Keeping them for
134246
// compatibility.
135247
entry.getMethod(),
136-
entry.getCompressedSize(),
248+
lastZipEntryCSize = entry.getCompressedSize(),
137249
entry.getSize(),
138250
entry.getLastModifiedTime().toMillis(),
139-
entry.getCrc()});
251+
entry.getCrc(),
252+
// store the entry position for faster reading content
253+
lastZipEntryPos = zipEntryPos
254+
});
140255
filesDict.setItem(entry.getName(), tuple);
256+
// count local file header from the last zipentry
257+
lastZipLocFileHeaderSize = 30 + entry.getName().length();
258+
extraField = entry.getExtra();
259+
if (extraField != null) {
260+
lastZipLocFileHeaderSize += extraField.length;
261+
}
141262
}
142-
files = filesDict;
143-
self.getZipDirectoryCache().setItem(path, files);
144-
}
145-
self.setArchive(archive);
146-
self.setPrefix(prefix);
147-
self.setFiles((PDict) files);
148-
} catch (IOException ex) {
149-
throw raise(PythonErrorType.ZipImportError, "not a Zip file");
150-
} finally {
151-
if (zis != null) {
152-
try {
153-
zis.close();
154-
} catch (IOException e) {
155-
// just ignore it.
263+
} catch (IOException ex) {
264+
throw raise(PythonErrorType.ZipImportError, "not a Zip file");
265+
} finally {
266+
if (zis != null) {
267+
try {
268+
zis.close();
269+
} catch (IOException e) {
270+
// just ignore it.
271+
}
272+
} else {
273+
if (locis != null) {
274+
try {
275+
locis.close();
276+
} catch (IOException e) {
277+
// just ignore it.
278+
}
279+
}
156280
}
157281
}
282+
files = filesDict;
283+
self.getZipDirectoryCache().setItem(path, files);
158284
}
285+
self.setArchive(archive);
286+
self.setPrefix(prefix);
287+
self.setFiles((PDict) files);
288+
159289
} else {
160290
throw raise(PythonErrorType.ZipImportError, "not a Zip file");
161291
}
@@ -302,16 +432,6 @@ public static GetCodeNode create() {
302432
@GenerateNodeFactory
303433
public abstract static class GetDataNode extends PythonBinaryBuiltinNode {
304434

305-
private static ZipInputStream getEntryIS(InputStream fileStream, String entryName) throws IOException {
306-
ZipInputStream zis = new ZipInputStream(fileStream);
307-
for (ZipEntry entry; (entry = zis.getNextEntry()) != null;) {
308-
if (entry.getName().equals(entryName)) {
309-
return zis;
310-
}
311-
}
312-
throw new IOException("Cannot find " + entryName);
313-
}
314-
315435
@Specialization
316436
@CompilerDirectives.TruffleBoundary
317437
public PBytes doit(PZipImporter self, String pathname) {
@@ -336,10 +456,18 @@ public PBytes doit(PZipImporter self, String pathname) {
336456
if (fileSize < 0) {
337457
throw raise(PythonErrorType.ZipImportError, "negative data size");
338458
}
459+
long streamPosition = (long) tocEntry.getArray()[6];
339460
ZipInputStream zis = null;
340461
TruffleFile tfile = getContext().getEnv().getTruffleFile(archive);
341462
try {
342-
zis = getEntryIS(tfile.newInputStream(StandardOpenOption.READ), key);
463+
InputStream in = tfile.newInputStream(StandardOpenOption.READ);
464+
in.skip(streamPosition); // we can fast skip bytes, because there is cached position
465+
// of the zip entry
466+
zis = new ZipInputStream(in);
467+
ZipEntry entry = zis.getNextEntry();
468+
if (entry == null || !entry.getName().equals(key)) {
469+
throw raise(PythonErrorType.ZipImportError, "zipimport: wrong cached file position");
470+
}
343471
int byteSize = (int) fileSize;
344472
if (byteSize != fileSize) {
345473
throw raise(PythonErrorType.ZipImportError, "zipimport: cannot read archive members large than 2GB");

0 commit comments

Comments
 (0)