Skip to content

Commit cc98f60

Browse files
committed
[GR-14380] zipimporter uses the JDK file system instead of the Truffle file system.
PullRequest: graalpython/434
2 parents f248f1e + 533dbb9 commit cc98f60

File tree

3 files changed

+231
-59
lines changed

3 files changed

+231
-59
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_zipimport.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2018, Oracle and/or its affiliates.
1+
# Copyright (c) 2018, 2019, Oracle and/or its affiliates.
22
# Copyright (C) 1996-2017 Python Software Foundation
33
#
44
# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
@@ -24,9 +24,12 @@ def get_file():
2424
"""
2525

2626
ZIP_FILE_NAME = 'testzipfile.zip'
27+
EGG_FILE_NAME = 'testeggfile.egg'
2728
DIR_PATH = os.path.dirname(os.path.realpath(__file__))
2829
ZIP_PATH = os.path.join(DIR_PATH, ZIP_FILE_NAME)
2930
ZIP_ABS_PATH = os.path.abspath(ZIP_PATH);
31+
EGG_PATH = os.path.join(DIR_PATH, EGG_FILE_NAME)
32+
EGG_ABS_PATH = os.path.abspath(EGG_PATH);
3033

3134
class ZipImportBaseTestCase(unittest.TestCase):
3235

@@ -176,3 +179,26 @@ def test_module_import(self):
176179
self.assertTrue (m.get_file() == ZIP_ABS_PATH + "/MyTestModule.py")
177180
p = importlib.import_module("packageA.moduleC")
178181
self.assertTrue (p.get_file() == ZIP_ABS_PATH + "/packageA/moduleC.py")
182+
183+
class BasicEggImportTests(ZipImportBaseTestCase):
184+
185+
def setUp(self):
186+
ZipImportBaseTestCase.setUp(self)
187+
self.z = zipimport.zipimporter(EGG_PATH)
188+
189+
def test_zipimporter_egg(self):
190+
self.assertTrue(self.z.prefix == "")
191+
self.assertTrue(self.z.archive == EGG_ABS_PATH)
192+
self.assertTrue(type(self.z._files) is dict)
193+
self.assertTrue(self.z._files["data.bin"] is not None)
194+
self.assertTrue(self.z._files["read.me"] is not None)
195+
196+
def test_egg_get_data(self):
197+
data = self.z.get_data("data.bin")
198+
self.assertTrue(type(data) is bytes)
199+
self.assertEqual(bytes(b'ahojPK\003\004ahoj'), data)
200+
201+
def test_egg_get_readme(self):
202+
data = self.z.get_data("read.me")
203+
self.assertTrue(type(data) is bytes)
204+
self.assertEqual(bytes(b'Pokus\n'), data)
Binary file not shown.

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/zipimporter/ZipImporterBuiltins.java

Lines changed: 204 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
import com.oracle.graal.python.runtime.sequence.storage.SequenceStorage;
6161
import com.oracle.truffle.api.CompilerDirectives;
6262
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
63+
import com.oracle.truffle.api.TruffleFile;
6364
import com.oracle.truffle.api.dsl.Cached;
6465
import com.oracle.truffle.api.dsl.Fallback;
6566
import com.oracle.truffle.api.dsl.GenerateNodeFactory;
@@ -70,16 +71,112 @@
7071
import java.io.File;
7172
import java.io.IOException;
7273
import java.io.InputStream;
73-
import java.util.Enumeration;
74+
import java.nio.file.StandardOpenOption;
75+
import java.util.ArrayList;
7476
import java.util.List;
7577
import java.util.zip.ZipEntry;
76-
import java.util.zip.ZipFile;
78+
import java.util.zip.ZipInputStream;
7779

7880
@CoreFunctions(extendClasses = PythonBuiltinClassType.PZipImporter)
7981
public class ZipImporterBuiltins extends PythonBuiltins {
8082

8183
private static final String INIT_WAS_NOT_CALLED = "zipimporter.__init__() wasn't called";
8284

85+
/**
86+
* This stream is need to find locations of zip entries in the zip file. The main purpose of
87+
* this is to find location of the first local file header in the zipfile, which doesn't have to
88+
* be as ZipInputStream expects. Some of zip files (like .egg files) don't start with location
89+
* signature `PK\003\004` but with a code, that should be executed.
90+
*
91+
* In such case ZipInptuStream doesn't work, it just expects that the stream starts with the
92+
* location signature.
93+
*
94+
* This stream also improve performance of unzipping files in ZipImporter case. A content of
95+
* file is obtained from the zip, when it's needed (imported). The locations of zip entry
96+
* positions are cached in the zip directory cache. When content of a file is needed, then
97+
* previous zip entries are skipped and ZipInputStream is created from the required position.
98+
*
99+
* New ZipInputStream from this stream can be created after calling findFirstEntryPostion.
100+
*
101+
* It locates all occurrences of LOC signatures, even if a signature is a part of a content of a
102+
* file. This situation has to be handled separately.
103+
*/
104+
private static class LOCZipEntryStream extends InputStream {
105+
// states of the simple lexer
106+
private static final byte AFTER_P = 1;
107+
private static final byte AFTER_PK = 2;
108+
private static final byte AFTER_PK3 = 3;
109+
private static final byte BEFORE_P = 0;
110+
111+
private byte state = BEFORE_P; // the default state
112+
private static final byte[] LOC_SIG = new byte[]{80, 75, 3, 4}; // zip location signature
113+
114+
private final InputStream in;
115+
long pos = 0; // position in the input stream
116+
private boolean readFirstLoc; // is the first location detected?
117+
List<Long> positions; // store the locations
118+
119+
public LOCZipEntryStream(InputStream in) {
120+
this.readFirstLoc = false;
121+
this.positions = new ArrayList<>();
122+
this.in = in;
123+
}
124+
125+
@Override
126+
public int read() throws IOException {
127+
if (readFirstLoc) {
128+
// This expect that the bytes of the first LOC was consumed by this stream
129+
// (due to calling findFirstEntryPosition) and now the stream
130+
// has to push back the LOC bytes
131+
int index = (int) (pos - positions.get(0));
132+
if (index < LOC_SIG.length) {
133+
pos++;
134+
return LOC_SIG[index];
135+
}
136+
readFirstLoc = false; // never do it again
137+
}
138+
int ch = in.read();
139+
pos++;
140+
switch (state) {
141+
case BEFORE_P:
142+
if (ch == LOC_SIG[0]) {
143+
state = AFTER_P;
144+
}
145+
break;
146+
case AFTER_P:
147+
if (ch == LOC_SIG[1]) {
148+
state = AFTER_PK;
149+
} else {
150+
state = BEFORE_P;
151+
}
152+
break;
153+
case AFTER_PK:
154+
if (ch == LOC_SIG[2]) {
155+
state = AFTER_PK3;
156+
} else {
157+
state = BEFORE_P;
158+
}
159+
break;
160+
case AFTER_PK3:
161+
if (ch == LOC_SIG[3]) {
162+
positions.add(pos - 4); // store the LOC position
163+
}
164+
state = BEFORE_P;
165+
}
166+
return ch;
167+
}
168+
169+
void findFirstEntryPosition() throws IOException {
170+
while (positions.isEmpty() && read() != -1) {
171+
// do nothing here, just read until the first LOC is found
172+
}
173+
if (!positions.isEmpty()) {
174+
pos -= 4;
175+
readFirstLoc = true;
176+
}
177+
}
178+
}
179+
83180
@Override
84181
protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFactories() {
85182
return ZipImporterBuiltinsFactory.getFactories();
@@ -99,65 +196,107 @@ private void initZipImporter(PZipImporter self, String path) {
99196
throw raise(PythonErrorType.ZipImportError, "archive path is empty");
100197
}
101198

102-
File file = new File(path);
199+
TruffleFile tfile = getContext().getEnv().getTruffleFile(path);
103200
String prefix = "";
104201
String archive = "";
105202
while (true) {
106-
File fullPathFile = new File(file.getPath());
107-
try {
108-
if (fullPathFile.isFile()) {
109-
archive = file.getPath();
110-
break;
111-
}
112-
} catch (SecurityException se) {
113-
// continue
203+
if (tfile.isRegularFile()) {
204+
// we don't have to store absolute path
205+
archive = tfile.getPath();
206+
break;
114207
}
115-
116-
// back up one path element
117-
File parentFile = file.getParentFile();
208+
TruffleFile parentFile = tfile.getParent();
118209
if (parentFile == null) {
119210
break;
120211
}
121-
122-
prefix = file.getName() + PZipImporter.SEPARATOR + prefix;
123-
file = parentFile;
212+
prefix = tfile.getName() + PZipImporter.SEPARATOR + prefix;
213+
tfile = parentFile;
124214
}
125-
ZipFile zipFile = null;
126215

127-
if (file.exists() && file.isFile()) {
128-
try {
129-
zipFile = new ZipFile(file);
130-
} catch (IOException e) {
131-
throw raise(PythonErrorType.ZipImportError, "not a Zip file");
132-
}
133-
}
134-
if (zipFile == null) {
135-
throw raise(PythonErrorType.ZipImportError, "not a Zip file");
136-
}
137-
Object files = self.getZipDirectoryCache().getItem(path);
138-
if (files == null) {
139-
PDict filesDict = factory().createDict();
140-
Enumeration<? extends ZipEntry> entries = zipFile.entries();
141-
while (entries.hasMoreElements()) {
142-
ZipEntry entry = entries.nextElement();
143-
PTuple tuple = factory().createTuple(new Object[]{
144-
zipFile.getName() + PZipImporter.SEPARATOR + entry.getName(),
145-
// for our implementation currently we don't need these
146-
// these properties to store there. Keeping them for
147-
// compatibility.
148-
entry.getMethod(),
149-
entry.getCompressedSize(),
150-
entry.getSize(),
151-
entry.getLastModifiedTime().toMillis(),
152-
entry.getCrc()});
153-
filesDict.setItem(entry.getName(), tuple);
216+
if (tfile.exists() && tfile.isRegularFile()) {
217+
Object files = self.getZipDirectoryCache().getItem(path);
218+
if (files == null) {
219+
// fill the cache
220+
PDict filesDict = factory().createDict();
221+
ZipInputStream zis = null;
222+
LOCZipEntryStream locis = null;
223+
try {
224+
locis = new LOCZipEntryStream(tfile.newInputStream(StandardOpenOption.READ));
225+
locis.findFirstEntryPosition(); // find location of the first zip entry
226+
if (locis.positions.isEmpty()) {
227+
// no PK\003\004 found -> not a correct zip file
228+
throw raise(PythonErrorType.ZipImportError, "not a Zip file: '%s'", archive);
229+
}
230+
zis = new ZipInputStream(locis); // and create new ZipInput stream from this
231+
// location
232+
ZipEntry entry;
233+
234+
// help variable to handle case when there LOC is in content of a file
235+
long lastZipEntryCSize = 0;
236+
long lastZipEntryPos = 0;
237+
int lastZipLocFileHeaderSize = 0;
238+
long zipEntryPos;
239+
240+
byte[] extraField;
241+
while ((entry = zis.getNextEntry()) != null) {
242+
zipEntryPos = locis.positions.remove(0);
243+
// handles situation when the local file signature is
244+
// in the content of a file
245+
while (lastZipEntryPos + lastZipEntryCSize + lastZipLocFileHeaderSize > zipEntryPos) {
246+
zipEntryPos = locis.positions.remove(0);
247+
}
248+
249+
PTuple tuple = factory().createTuple(new Object[]{
250+
tfile.getPath() + PZipImporter.SEPARATOR + entry.getName(),
251+
// for our implementation currently we don't need these
252+
// these properties to store there. Keeping them for
253+
// compatibility.
254+
entry.getMethod(),
255+
lastZipEntryCSize = entry.getCompressedSize(),
256+
entry.getSize(),
257+
entry.getLastModifiedTime().toMillis(),
258+
entry.getCrc(),
259+
// store the entry position for faster reading content
260+
lastZipEntryPos = zipEntryPos
261+
});
262+
filesDict.setItem(entry.getName(), tuple);
263+
// count local file header from the last zipentry
264+
lastZipLocFileHeaderSize = 30 + entry.getName().length();
265+
extraField = entry.getExtra();
266+
if (extraField != null) {
267+
lastZipLocFileHeaderSize += extraField.length;
268+
}
269+
}
270+
} catch (IOException ex) {
271+
throw raise(PythonErrorType.ZipImportError, "not a Zip file: '%s'", archive);
272+
} finally {
273+
if (zis != null) {
274+
try {
275+
zis.close();
276+
} catch (IOException e) {
277+
// just ignore it.
278+
}
279+
} else {
280+
if (locis != null) {
281+
try {
282+
locis.close();
283+
} catch (IOException e) {
284+
// just ignore it.
285+
}
286+
}
287+
}
288+
}
289+
files = filesDict;
290+
self.getZipDirectoryCache().setItem(path, files);
154291
}
155-
files = filesDict;
156-
self.getZipDirectoryCache().setItem(path, files);
292+
self.setArchive(archive);
293+
self.setPrefix(prefix);
294+
self.setFiles((PDict) files);
295+
296+
} else {
297+
throw raise(PythonErrorType.ZipImportError, "not a Zip file: '%s'", archive);
157298
}
158-
self.setArchive(archive);
159-
self.setPrefix(prefix);
160-
self.setFiles((PDict) files);
299+
161300
}
162301

163302
@Specialization
@@ -324,28 +463,35 @@ public PBytes doit(PZipImporter self, String pathname) {
324463
if (fileSize < 0) {
325464
throw raise(PythonErrorType.ZipImportError, "negative data size");
326465
}
327-
ZipFile zip = null;
466+
long streamPosition = (long) tocEntry.getArray()[6];
467+
ZipInputStream zis = null;
468+
TruffleFile tfile = getContext().getEnv().getTruffleFile(archive);
328469
try {
329-
zip = new ZipFile(archive);
330-
ZipEntry entry = zip.getEntry(key);
331-
InputStream in = zip.getInputStream(entry);
470+
InputStream in = tfile.newInputStream(StandardOpenOption.READ);
471+
in.skip(streamPosition); // we can fast skip bytes, because there is cached position
472+
// of the zip entry
473+
zis = new ZipInputStream(in);
474+
ZipEntry entry = zis.getNextEntry();
475+
if (entry == null || !entry.getName().equals(key)) {
476+
throw raise(PythonErrorType.ZipImportError, "zipimport: wrong cached file position");
477+
}
332478
int byteSize = (int) fileSize;
333479
if (byteSize != fileSize) {
334480
throw raise(PythonErrorType.ZipImportError, "zipimport: cannot read archive members large than 2GB");
335481
}
336482
byte[] bytes = new byte[byteSize];
337483
int bytesRead = 0;
338484
while (bytesRead < byteSize) {
339-
bytesRead += in.read(bytes, bytesRead, byteSize - bytesRead);
485+
bytesRead += zis.read(bytes, bytesRead, byteSize - bytesRead);
340486
}
341-
in.close();
487+
zis.close();
342488
return factory().createBytes(bytes);
343489
} catch (IOException e) {
344490
throw raise(PythonErrorType.ZipImportError, "zipimport: can't read data");
345491
} finally {
346-
if (zip != null) {
492+
if (zis != null) {
347493
try {
348-
zip.close();
494+
zis.close();
349495
} catch (IOException e) {
350496
// just ignore it.
351497
}

0 commit comments

Comments
 (0)