Skip to content

Commit 47ef664

Browse files
committed
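Enable FileCompression to optionally decompress until the end of the input (all concatenated streams) instead of stopping after the first stream.
Allows FileOpener to (transparently) process multi-block Blocked GNU Zip Format (BGZF) files when decompressConcatenated is enabled; the default (false) keeps the previous behaviour.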
1 parent b6e740c commit 47ef664

File tree

7 files changed

+89
-16
lines changed

7 files changed

+89
-16
lines changed

metafacture-io/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ dependencies {
2121
api project(':metafacture-framework')
2222
implementation project(':metafacture-commons')
2323
implementation 'commons-io:commons-io:2.5'
24-
implementation 'org.apache.commons:commons-compress:1.12'
24+
implementation 'org.apache.commons:commons-compress:1.20'
2525
runtimeOnly 'org.tukaani:xz:1.6'
2626
testImplementation 'junit:junit:4.12'
2727
testImplementation 'org.mockito:mockito-core:2.5.5'

metafacture-io/src/main/java/org/metafacture/io/FileCompression.java

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
4343
}
4444

4545
@Override
46-
public InputStream createDecompressor(final InputStream readFrom) {
46+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
4747
return new ProxyInputStream(readFrom) {
4848
//nothing to do
4949
};
@@ -77,12 +77,14 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
7777
}
7878

7979
@Override
80-
public InputStream createDecompressor(final InputStream readFrom) {
80+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
8181
final InputStream bufferedStream = bufferStream(readFrom);
8282
try {
83-
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(bufferedStream);
83+
return decompressConcatenated ?
84+
APACHE_COMPRESSOR_FACTORY_DECOMPRESS_CONCATENATED.createCompressorInputStream(bufferedStream) :
85+
APACHE_COMPRESSOR_FACTORY_NO_DECOMPRESS_CONCATENATED.createCompressorInputStream(bufferedStream);
8486
} catch (CompressorException e) {
85-
return NONE.createDecompressor(bufferedStream);
87+
return NONE.createDecompressor(bufferedStream, decompressConcatenated);
8688
}
8789
}
8890
},
@@ -99,10 +101,10 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
99101
}
100102

101103
@Override
102-
public InputStream createDecompressor(final InputStream readFrom) {
104+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
103105
try {
104106
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
105-
CompressorStreamFactory.BZIP2, bufferStream(readFrom));
107+
CompressorStreamFactory.BZIP2, bufferStream(readFrom), decompressConcatenated);
106108
} catch (CompressorException e) {
107109
throw new MetafactureException(e);
108110
}
@@ -121,10 +123,10 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
121123
}
122124

123125
@Override
124-
public InputStream createDecompressor(final InputStream readFrom) {
126+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
125127
try {
126128
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
127-
CompressorStreamFactory.GZIP, bufferStream(readFrom));
129+
CompressorStreamFactory.GZIP, bufferStream(readFrom), decompressConcatenated);
128130
} catch (CompressorException e) {
129131
throw new MetafactureException(e);
130132
}
@@ -143,10 +145,10 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
143145
}
144146

145147
@Override
146-
public InputStream createDecompressor(final InputStream readFrom) {
148+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
147149
try {
148150
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
149-
CompressorStreamFactory.PACK200, bufferStream(readFrom));
151+
CompressorStreamFactory.PACK200, bufferStream(readFrom), decompressConcatenated);
150152
} catch (CompressorException e) {
151153
throw new MetafactureException(e);
152154
}
@@ -165,22 +167,32 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
165167
}
166168

167169
@Override
168-
public InputStream createDecompressor(final InputStream readFrom) {
170+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
169171
try {
170172
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
171-
CompressorStreamFactory.XZ, bufferStream(readFrom));
173+
CompressorStreamFactory.XZ, bufferStream(readFrom), decompressConcatenated);
172174
} catch (CompressorException e) {
173175
throw new MetafactureException(e);
174176
}
175177
}
176178
};
177179

178-
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY = new CompressorStreamFactory();
180+
public static final boolean DEFAULT_DECOMPRESS_CONCATENATED = false;
181+
182+
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY_DECOMPRESS_CONCATENATED = new CompressorStreamFactory(true);
183+
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY_NO_DECOMPRESS_CONCATENATED = new CompressorStreamFactory(false);
184+
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY = DEFAULT_DECOMPRESS_CONCATENATED ?
185+
APACHE_COMPRESSOR_FACTORY_DECOMPRESS_CONCATENATED : APACHE_COMPRESSOR_FACTORY_NO_DECOMPRESS_CONCATENATED;
186+
179187
private static final int BUFFER_SIZE = 8 * 1024 * 1024;
180188

181189
public abstract OutputStream createCompressor(final OutputStream writeTo, final String fileName);
182190

183-
public abstract InputStream createDecompressor(final InputStream readFrom);
191+
public abstract InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated);
192+
193+
public InputStream createDecompressor(final InputStream readFrom) {
194+
return createDecompressor(readFrom, DEFAULT_DECOMPRESS_CONCATENATED);
195+
}
184196

185197
private static OutputStream bufferStream(final OutputStream stream) {
186198
if (stream instanceof BufferedOutputStream) {

metafacture-io/src/main/java/org/metafacture/io/FileOpener.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ public final class FileOpener
4646

4747
private String encoding = "UTF-8";
4848
private FileCompression compression = FileCompression.AUTO;
49+
private boolean decompressConcatenated = FileCompression.DEFAULT_DECOMPRESS_CONCATENATED;
4950

5051
/**
5152
* Returns the encoding used to open the resource.
@@ -78,12 +79,20 @@ public void setCompression(final String compression) {
7879
setCompression(FileCompression.valueOf(compression.toUpperCase()));
7980
}
8081

82+
public boolean getDecompressConcatenated() {
83+
return decompressConcatenated;
84+
}
85+
86+
public void setDecompressConcatenated(final boolean decompressConcatenated) {
87+
this.decompressConcatenated = decompressConcatenated;
88+
}
89+
8190
@Override
8291
public void process(final String file) {
8392
try {
8493
final InputStream fileStream = new FileInputStream(file);
8594
try {
86-
final InputStream decompressor = compression.createDecompressor(fileStream);
95+
final InputStream decompressor = compression.createDecompressor(fileStream, decompressConcatenated);
8796
try {
8897

8998
final Reader reader = new InputStreamReader(new BOMInputStream(

metafacture-io/src/test/java/org/metafacture/io/FileOpenerCompressionTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ public FileOpenerCompressionTest(final String resourcePath,
7676
public static Iterable<Object[]> data() {
7777
return Arrays.asList(new Object[][] {
7878
{ "compressed.txt", FileCompression.AUTO },
79+
{ "compressed.txt.bgzf", FileCompression.AUTO },
7980
{ "compressed.txt.bz2", FileCompression.AUTO },
8081
{ "compressed.txt.bzip2", FileCompression.AUTO },
8182
{ "compressed.txt.gz", FileCompression.AUTO },
@@ -84,6 +85,7 @@ public static Iterable<Object[]> data() {
8485
{ "compressed.txt", FileCompression.NONE },
8586
{ "compressed.txt.bz2", FileCompression.BZIP2 },
8687
{ "compressed.txt.bzip2", FileCompression.BZIP2 },
88+
{ "compressed.txt.bgzf", FileCompression.GZIP },
8789
{ "compressed.txt.gz", FileCompression.GZIP },
8890
{ "compressed.txt.gzip", FileCompression.GZIP },
8991
{ "compressed.txt.xz", FileCompression.XZ },

metafacture-io/src/test/java/org/metafacture/io/FileOpenerTest.java

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,21 @@
1616
package org.metafacture.io;
1717

1818
import static org.junit.Assert.assertEquals;
19+
import static org.junit.Assert.assertTrue;
1920
import static org.junit.Assume.assumeFalse;
2021
import static org.mockito.Mockito.verify;
2122

2223
import java.io.File;
2324
import java.io.FileOutputStream;
2425
import java.io.IOException;
26+
import java.io.InputStream;
27+
import java.io.InputStreamReader;
2528
import java.io.OutputStream;
2629
import java.io.Reader;
2730
import java.nio.charset.Charset;
2831
import java.nio.charset.StandardCharsets;
32+
import java.nio.file.Files;
33+
import java.nio.file.StandardCopyOption;
2934

3035
import org.junit.Rule;
3136
import org.junit.Test;
@@ -77,6 +82,43 @@ public void testUtf8IsDefaultEncoding() throws IOException {
7782
assertEquals(DATA, ResourceUtil.readAll(processedObject.getValue()));
7883
}
7984

85+
@Test
86+
public void testNoDecompressConcatenated() throws IOException {
87+
testDecompressConcatenated(false);
88+
}
89+
90+
@Test
91+
public void testDecompressConcatenated() throws IOException {
92+
testDecompressConcatenated(true);
93+
}
94+
95+
private void testDecompressConcatenated(final boolean decompressConcatenated) throws IOException {
96+
final int maxBytes = (int) Math.pow(2, 16); // BGZF max compressed block size
97+
final StringBuilder sb = new StringBuilder();
98+
99+
try (InputStreamReader r = new InputStreamReader(getClass().getResourceAsStream("compressed.txt"))) {
100+
final String data = ResourceUtil.readAll(r);
101+
for (int i = 0; i < 1525; i++) {
102+
sb.append(data).append("\n");
103+
}
104+
}
105+
106+
final String data = sb.toString();
107+
assertTrue(data.length() + " > " + maxBytes, data.length() > maxBytes);
108+
109+
final File testFile = copyResourceToTempFile("compressed-large.txt.bgzf");
110+
111+
final FileOpener opener = new FileOpener();
112+
opener.setDecompressConcatenated(decompressConcatenated);
113+
opener.setReceiver(receiver);
114+
opener.process(testFile.getAbsolutePath());
115+
opener.closeStream();
116+
117+
verify(receiver).process(processedObject.capture());
118+
assertEquals(decompressConcatenated ? data : data.substring(0, maxBytes),
119+
ResourceUtil.readAll(processedObject.getValue()));
120+
}
121+
80122
private File createTestFile() throws IOException {
81123
final File file = tempFolder.newFile();
82124
try (OutputStream stream = new FileOutputStream(file)) {
@@ -85,4 +127,12 @@ private File createTestFile() throws IOException {
85127
return file;
86128
}
87129

130+
private File copyResourceToTempFile(final String resourcePath) throws IOException {
131+
final File file = tempFolder.newFile();
132+
try (InputStream in = getClass().getResourceAsStream(resourcePath)) {
133+
Files.copy(in, file.toPath(), StandardCopyOption.REPLACE_EXISTING);
134+
}
135+
return file;
136+
}
137+
88138
}
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)