Solving problems with ZipInputStream.

ppisl · ppisl · commit cdae4a503a4c · 2019-03-18T13:23:53.000+01:00
diff --git a/graalpython/com.oracle.graal.python.test/src/tests/test_zipimport.py b/graalpython/com.oracle.graal.python.test/src/tests/test_zipimport.py
@@ -24,9 +24,12 @@ def get_file():
 """
 
 ZIP_FILE_NAME = 'testzipfile.zip'
+EGG_FILE_NAME = 'testeggfile.egg'
 DIR_PATH = os.path.dirname(os.path.realpath(__file__))
 ZIP_PATH = os.path.join(DIR_PATH, ZIP_FILE_NAME)
 ZIP_ABS_PATH = os.path.abspath(ZIP_PATH);
+EGG_PATH = os.path.join(DIR_PATH, EGG_FILE_NAME)
+EGG_ABS_PATH = os.path.abspath(EGG_PATH);
 
 class ZipImportBaseTestCase(unittest.TestCase):
 
@@ -176,3 +179,26 @@ def test_module_import(self):
         self.assertTrue (m.get_file() == ZIP_ABS_PATH + "/MyTestModule.py")
         p = importlib.import_module("packageA.moduleC")
         self.assertTrue (p.get_file() == ZIP_ABS_PATH + "/packageA/moduleC.py")
+
+class BasicEggImportTests(ZipImportBaseTestCase):  # zipimporter tests run against the .egg test archive (EGG_PATH)
+
+    def setUp(self):
+        ZipImportBaseTestCase.setUp(self)
+        self.z = zipimport.zipimporter(EGG_PATH)  # importer under test, opened directly on the .egg file
+
+    def test_zipimporter_egg(self):
+        self.assertTrue(self.z.prefix == "")  # no sub-path inside the archive was requested
+        self.assertTrue(self.z.archive == EGG_ABS_PATH)
+        self.assertTrue(type(self.z._files) is dict)
+        self.assertTrue(self.z._files["data.bin"] is not None)  # both archive members must be listed
+        self.assertTrue(self.z._files["read.me"] is not None)
+        
+    def test_egg_get_data(self):
+        data = self.z.get_data("data.bin")
+        self.assertTrue(type(data) is bytes)
+        self.assertEqual(bytes(b'ahojPK\003\004ahoj'), data)  # expected payload embeds the zip LOC signature PK\003\004
+    
+    def test_egg_get_readme(self):
+        data = self.z.get_data("read.me")
+        self.assertTrue(type(data) is bytes)
+        self.assertEqual(bytes(b'Pokus\n'), data)
diff --git a/graalpython/com.oracle.graal.python.test/src/tests/testeggfile.egg b/graalpython/com.oracle.graal.python.test/src/tests/testeggfile.egg
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/zipimporter/ZipImporterBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/zipimporter/ZipImporterBuiltins.java
@@ -72,6 +72,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
@@ -81,6 +82,98 @@ public class ZipImporterBuiltins extends PythonBuiltins {
 
     private static final String INIT_WAS_NOT_CALLED = "zipimporter.__init__() wasn't called";
 
+    /**
+     * Input stream wrapper that records the byte offsets of zip local-file-header (LOC) signatures
+     * {@code PK\003\004} while it is being read. Its main purpose is to locate the first local
+     * file header, which does not have to be at offset 0 as ZipInputStream expects: some zip
+     * files (like .egg files) start with code that should be executed instead of the signature.
+     * 
+     * In such a case ZipInputStream doesn't work, because it expects the stream to start with the
+     * LOC signature.
+     * 
+     * This stream also improves the performance of unzipping files in the ZipImporter case. The
+     * content of a file is obtained from the zip only when it is needed (imported). The zip entry
+     * positions are cached in the zip directory cache. When the content of a file is needed, the
+     * previous zip entries are skipped and a ZipInputStream is created at the required position.
+     * 
+     * A ZipInputStream can be created on top of this stream after calling findFirstEntryPosition.
+     * 
+     * It records all occurrences of the LOC signature, even when the signature is part of the
+     * content of a file. That situation has to be handled separately by the caller.
+     */
+    private static class LOCZipEntryStream extends InputStream {
+        // states of the simple lexer that matches LOC_SIG one byte at a time
+        private static final byte AFTER_P = 1;
+        private static final byte AFTER_PK = 2;
+        private static final byte AFTER_PK3 = 3;
+        private static final byte BEFORE_P = 0;
+
+        private byte state = BEFORE_P;  // the default state
+        private static final byte[] LOC_SIG = new byte[]{80, 75, 3, 4}; // zip LOC signature: 'P', 'K', 3, 4
+
+        private final InputStream in;  // wrapped stream; NOTE(review): close() is not overridden here, so 'in' is not closed through this class - confirm
+        long pos = 0;                  // position in the input stream
+        private boolean readFirstLoc;  // true while read() still has to replay the first LOC signature
+        List<Long> positions;          // offsets of the LOC signatures seen so far
+
+        public LOCZipEntryStream(InputStream in) {
+            this.readFirstLoc = false;
+            this.positions = new ArrayList<>();
+            this.in = in;
+        }
+
+        @Override
+        public int read() throws IOException {
+            if (readFirstLoc) {
+                // This expects that the bytes of the first LOC signature were consumed by this
+                // stream (due to calling findFirstEntryPosition), so the stream now has to
+                // replay the LOC bytes before reading further from the wrapped stream
+                int index = (int) (pos - positions.get(0)); // NOTE(review): throws IndexOutOfBoundsException when no signature was found
+                if (index < LOC_SIG.length) {
+                    pos++;
+                    return LOC_SIG[index];
+                }
+                readFirstLoc = false;  // never do it again
+            }
+            int ch = in.read();
+            pos++; // NOTE(review): pos is advanced even when ch is -1 (end of stream)
+            switch (state) {
+                case BEFORE_P:
+                    if (ch == LOC_SIG[0]) {
+                        state = AFTER_P;
+                    }
+                    break;
+                case AFTER_P:
+                    if (ch == LOC_SIG[1]) {
+                        state = AFTER_PK;
+                    } else {
+                        state = BEFORE_P; // NOTE(review): a repeated 'P' also resets here, so "PPK\003\004" is not detected - confirm
+                    }
+                    break;
+                case AFTER_PK:
+                    if (ch == LOC_SIG[2]) {
+                        state = AFTER_PK3;
+                    } else {
+                        state = BEFORE_P;
+                    }
+                    break;
+                case AFTER_PK3:
+                    if (ch == LOC_SIG[3]) {
+                        positions.add(pos - 4);  // store the LOC position (pos is already past the 4 signature bytes)
+                    }
+                    state = BEFORE_P;
+            }
+            return ch;
+        }
+
+        void findFirstEntryPosition() throws IOException {
+            while (positions.isEmpty() && read() != -1) { // consume bytes until the first signature is recorded or the stream ends
+            }
+            pos -= 4; // rewind pos to the start of the signature so read() replays LOC_SIG; NOTE(review): also runs when no signature was found
+            readFirstLoc = true;
+        }
+    }
+
     @Override
     protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFactories() {
         return ZipImporterBuiltinsFactory.getFactories();
@@ -116,46 +209,83 @@ private void initZipImporter(PZipImporter self, String path) {
                 prefix = tfile.getName() + PZipImporter.SEPARATOR + prefix;
                 tfile = parentFile;
             }
-            ZipInputStream zis = null;
+
             if (tfile.exists() && tfile.isRegularFile()) {
-                try {
-                    zis = new ZipInputStream(tfile.newInputStream(StandardOpenOption.READ));
-                    Object files = self.getZipDirectoryCache().getItem(path);
-                    if (files == null) {
-                        // fill the cache
-                        PDict filesDict = factory().createDict();
+                Object files = self.getZipDirectoryCache().getItem(path);
+                if (files == null) {
+                    // fill the cache
+                    PDict filesDict = factory().createDict();
+                    ZipInputStream zis = null;
+                    LOCZipEntryStream locis = null;
+                    try {
+                        locis = new LOCZipEntryStream(tfile.newInputStream(StandardOpenOption.READ));
+                        locis.findFirstEntryPosition(); // find location of the first zip entry
+                        zis = new ZipInputStream(locis); // and create new ZipInput stream from this
+                                                         // location
                         ZipEntry entry;
 
+                        // helper variables to handle the case when a LOC signature is in the content of a file
+                        long lastZipEntryCSize = 0;
+                        long lastZipEntryPos = 0;
+                        int lastZipLocFileHeaderSize = 0;
+                        long zipEntryPos;
+
+                        byte[] extraField;
                         while ((entry = zis.getNextEntry()) != null) {
+                            zipEntryPos = locis.positions.remove(0);
+                            // handles situation when the local file signature is
+                            // in the content of a file
+                            while (lastZipEntryPos + lastZipEntryCSize + lastZipLocFileHeaderSize > zipEntryPos) {
+                                zipEntryPos = locis.positions.remove(0);
+                            }
+
                             PTuple tuple = factory().createTuple(new Object[]{
                                             tfile.getPath() + PZipImporter.SEPARATOR + entry.getName(),
                                             // for our implementation currently we don't need these
                                             // these properties to store there. Keeping them for
                                             // compatibility.
                                             entry.getMethod(),
-                                            entry.getCompressedSize(),
+                                            lastZipEntryCSize = entry.getCompressedSize(),
                                             entry.getSize(),
                                             entry.getLastModifiedTime().toMillis(),
-                                            entry.getCrc()});
+                                            entry.getCrc(),
+                                            // store the entry position for faster reading content
+                                            lastZipEntryPos = zipEntryPos
+                            });
                             filesDict.setItem(entry.getName(), tuple);
+                            // compute the local file header size of this entry, used when checking the next entry's position
+                            lastZipLocFileHeaderSize = 30 + entry.getName().length();
+                            extraField = entry.getExtra();
+                            if (extraField != null) {
+                                lastZipLocFileHeaderSize += extraField.length;
+                            }
                         }
-                        files = filesDict;
-                        self.getZipDirectoryCache().setItem(path, files);
-                    }
-                    self.setArchive(archive);
-                    self.setPrefix(prefix);
-                    self.setFiles((PDict) files);
-                } catch (IOException ex) {
-                    throw raise(PythonErrorType.ZipImportError, "not a Zip file");
-                } finally {
-                    if (zis != null) {
-                        try {
-                            zis.close();
-                        } catch (IOException e) {
-                            // just ignore it.
+                    } catch (IOException ex) {
+                        throw raise(PythonErrorType.ZipImportError, "not a Zip file");
+                    } finally {
+                        if (zis != null) {
+                            try {
+                                zis.close();
+                            } catch (IOException e) {
+                                // just ignore it.
+                            }
+                        } else {
+                            if (locis != null) {
+                                try {
+                                    locis.close();
+                                } catch (IOException e) {
+                                    // just ignore it.
+                                }
+                            }
                         }
                     }
+                    files = filesDict;
+                    self.getZipDirectoryCache().setItem(path, files);
                 }
+                self.setArchive(archive);
+                self.setPrefix(prefix);
+                self.setFiles((PDict) files);
+
             } else {
                 throw raise(PythonErrorType.ZipImportError, "not a Zip file");
             }
@@ -302,16 +432,6 @@ public static GetCodeNode create() {
     @GenerateNodeFactory
     public abstract static class GetDataNode extends PythonBinaryBuiltinNode {
 
-        private static ZipInputStream getEntryIS(InputStream fileStream, String entryName) throws IOException {
-            ZipInputStream zis = new ZipInputStream(fileStream);
-            for (ZipEntry entry; (entry = zis.getNextEntry()) != null;) {
-                if (entry.getName().equals(entryName)) {
-                    return zis;
-                }
-            }
-            throw new IOException("Cannot find " + entryName);
-        }
-
         @Specialization
         @CompilerDirectives.TruffleBoundary
         public PBytes doit(PZipImporter self, String pathname) {
@@ -336,10 +456,18 @@ public PBytes doit(PZipImporter self, String pathname) {
             if (fileSize < 0) {
                 throw raise(PythonErrorType.ZipImportError, "negative data size");
             }
+            long streamPosition = (long) tocEntry.getArray()[6];
             ZipInputStream zis = null;
             TruffleFile tfile = getContext().getEnv().getTruffleFile(archive);
             try {
-                zis = getEntryIS(tfile.newInputStream(StandardOpenOption.READ), key);
+                InputStream in = tfile.newInputStream(StandardOpenOption.READ);
+                in.skip(streamPosition); // we can fast skip bytes, because there is cached position
+                                         // of the zip entry
+                zis = new ZipInputStream(in);
+                ZipEntry entry = zis.getNextEntry();
+                if (entry == null || !entry.getName().equals(key)) {
+                    throw raise(PythonErrorType.ZipImportError, "zipimport: wrong cached file position");
+                }
                 int byteSize = (int) fileSize;
                 if (byteSize != fileSize) {
                     throw raise(PythonErrorType.ZipImportError, "zipimport: cannot read archive members large than 2GB");