Skip to content

Commit 75b1c7c

Browse files
author
Olivier Chédru
authored
Merge pull request #68 from rzymek/inmem-zipfile
drastically improve reader memory consumption when using InputStream
2 parents b0a3cce + eb53158 commit 75b1c7c

File tree

2 files changed

+38
-25
lines changed

2 files changed

+38
-25
lines changed

fastexcel-reader/src/main/java/org/dhatim/fastexcel/reader/ReadableWorkbook.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,18 @@
2727
import java.util.stream.StreamSupport;
2828
import javax.xml.stream.XMLInputFactory;
2929
import javax.xml.stream.XMLStreamException;
30+
31+
import org.apache.commons.compress.archivers.zip.ZipFile;
32+
import org.apache.commons.compress.utils.SeekableInMemoryByteChannel;
3033
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
3134
import org.apache.poi.openxml4j.exceptions.NotOfficeXmlFileException;
3235
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
3336
import org.apache.poi.openxml4j.opc.OPCPackage;
3437
import org.apache.poi.openxml4j.opc.PackageAccess;
38+
import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
3539
import org.apache.poi.poifs.common.POIFSConstants;
3640
import org.apache.poi.poifs.storage.HeaderBlockConstants;
41+
import org.apache.poi.util.IOUtils;
3742
import org.apache.poi.util.LittleEndian;
3843
import org.apache.poi.xssf.eventusermodel.XSSFReader;
3944
import org.apache.poi.xssf.model.SharedStringsTable;
@@ -53,7 +58,8 @@ public ReadableWorkbook(File inputFile) throws IOException {
5358
}
5459

5560
/**
56-
* Note: when working with huge sheets (e.g. 500_000 rows) use {@link #ReadableWorkbook(File)}
61+
* Note: this constructor loads the whole xlsx file into memory
62+
* (only the compressed bytes; the content is not decompressed in memory).
5763
*/
5864
public ReadableWorkbook(InputStream inputStream) throws IOException {
5965
this(open(inputStream));
@@ -197,7 +203,9 @@ private static OPCPackage open(File file){
197203

198204
private static OPCPackage open(InputStream in) throws IOException {
199205
try {
200-
return OPCPackage.open(in);
206+
byte[] compressedBytes = IOUtils.toByteArray(in);
207+
ZipFile zipFile = new ZipFile(new SeekableInMemoryByteChannel(compressedBytes));
208+
return OPCPackage.open(new ZipFileZipEntrySource(zipFile));
201209
} catch (InvalidFormatException e) {
202210
throw new ExcelReaderException(e);
203211
}

fastexcel-reader/src/test/java/org/dhatim/fastexcel/reader/MemoryUsageTest.java

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,17 @@
66
import org.apache.poi.xssf.streaming.SXSSFSheet;
77
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
88
import org.junit.jupiter.api.BeforeAll;
9-
import org.junit.jupiter.api.BeforeEach;
109
import org.junit.jupiter.api.Test;
1110

12-
import java.io.File;
13-
import java.io.FileOutputStream;
14-
import java.io.IOException;
15-
import java.io.OutputStream;
11+
import java.io.*;
1612
import java.util.logging.Logger;
1713
import java.util.stream.Stream;
1814

1915
import static org.junit.jupiter.api.Assertions.assertEquals;
2016

2117
public class MemoryUsageTest {
2218
private static final Logger LOG = Logger.getLogger(MemoryUsageTest.class.getName());
23-
private static final int ROWS = 600_000;
19+
private static final int ROWS = 600_001;
2420
private static final int COLS = 200;
2521
private static File testFile = new File("target/memtest" + ROWS + "x" + COLS + ".xlsx");
2622

@@ -48,33 +44,42 @@ public static void generateBig() throws IOException {
4844
LOG.info("Size: " + testFile.length());
4945
}
5046

51-
@BeforeEach
52-
public void disableZipBombDetection() {
47+
@Test
48+
public void readFile() throws Exception {
5349
ZipSecureFileWorkaround.disableZipBombDetection();
50+
try (ReadableWorkbook wb = new ReadableWorkbook(testFile)) {
51+
fastexcelReader(wb);
52+
}
5453
}
5554

5655
@Test
57-
public void read() throws Exception {
58-
try (ReadableWorkbook wb = new ReadableWorkbook(testFile)) {
59-
org.dhatim.fastexcel.reader.Sheet sheet = wb.getFirstSheet();
60-
try (Stream<org.dhatim.fastexcel.reader.Row> rows = sheet.openStream()) {
61-
rows.forEach(r -> {
62-
printProgress("reading", r.getRowNum() - 1);
63-
for (int c = 0; c < r.getCellCount(); c++) {
64-
assertEquals(
65-
valueFor(r.getRowNum() - 1, c),
66-
r.getCell(c).asNumber().doubleValue(),
67-
1e-5);
56+
public void readInputStream() throws Exception {
57+
try (InputStream in = new FileInputStream(testFile);
58+
ReadableWorkbook wb = new ReadableWorkbook(in)
59+
) {
60+
fastexcelReader(wb);
61+
}
62+
}
6863

69-
}
70-
});
71-
}
64+
private void fastexcelReader(ReadableWorkbook wb) throws IOException {
65+
Sheet sheet = wb.getFirstSheet();
66+
try (Stream<Row> rows = sheet.openStream()) {
67+
rows.forEach(r -> {
68+
printProgress("reading", r.getRowNum() - 1);
69+
for (int c = 0; c < r.getCellCount(); c++) {
70+
assertEquals(
71+
valueFor(r.getRowNum() - 1, c),
72+
r.getCell(c).asNumber().doubleValue(),
73+
1e-5);
74+
75+
}
76+
});
7277
}
7378
}
7479

7580
private static void printProgress(String prefix, int r) {
7681
if (r % (ROWS / 100) == 0) {
77-
LOG.info(prefix+": "+(100 * r / ROWS) + "%");
82+
LOG.info(prefix + ": " + (100 * r / ROWS) + "%");
7883
}
7984
}
8085

0 commit comments

Comments
 (0)