Skip to content

Commit f8f7285

Browse files
authored
Merge pull request #358 from metafacture/file-opener-decompress-concatenated
Optionally decompress concatenated streams.
2 parents b6e740c + 47ef664 commit f8f7285

File tree

7 files changed

+89
-16
lines changed

7 files changed

+89
-16
lines changed

metafacture-io/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ dependencies {
2121
api project(':metafacture-framework')
2222
implementation project(':metafacture-commons')
2323
implementation 'commons-io:commons-io:2.5'
24-
implementation 'org.apache.commons:commons-compress:1.12'
24+
implementation 'org.apache.commons:commons-compress:1.20'
2525
runtimeOnly 'org.tukaani:xz:1.6'
2626
testImplementation 'junit:junit:4.12'
2727
testImplementation 'org.mockito:mockito-core:2.5.5'

metafacture-io/src/main/java/org/metafacture/io/FileCompression.java

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
4343
}
4444

4545
@Override
46-
public InputStream createDecompressor(final InputStream readFrom) {
46+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
4747
return new ProxyInputStream(readFrom) {
4848
//nothing to do
4949
};
@@ -77,12 +77,14 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
7777
}
7878

7979
@Override
80-
public InputStream createDecompressor(final InputStream readFrom) {
80+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
8181
final InputStream bufferedStream = bufferStream(readFrom);
8282
try {
83-
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(bufferedStream);
83+
return decompressConcatenated ?
84+
APACHE_COMPRESSOR_FACTORY_DECOMPRESS_CONCATENATED.createCompressorInputStream(bufferedStream) :
85+
APACHE_COMPRESSOR_FACTORY_NO_DECOMPRESS_CONCATENATED.createCompressorInputStream(bufferedStream);
8486
} catch (CompressorException e) {
85-
return NONE.createDecompressor(bufferedStream);
87+
return NONE.createDecompressor(bufferedStream, decompressConcatenated);
8688
}
8789
}
8890
},
@@ -99,10 +101,10 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
99101
}
100102

101103
@Override
102-
public InputStream createDecompressor(final InputStream readFrom) {
104+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
103105
try {
104106
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
105-
CompressorStreamFactory.BZIP2, bufferStream(readFrom));
107+
CompressorStreamFactory.BZIP2, bufferStream(readFrom), decompressConcatenated);
106108
} catch (CompressorException e) {
107109
throw new MetafactureException(e);
108110
}
@@ -121,10 +123,10 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
121123
}
122124

123125
@Override
124-
public InputStream createDecompressor(final InputStream readFrom) {
126+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
125127
try {
126128
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
127-
CompressorStreamFactory.GZIP, bufferStream(readFrom));
129+
CompressorStreamFactory.GZIP, bufferStream(readFrom), decompressConcatenated);
128130
} catch (CompressorException e) {
129131
throw new MetafactureException(e);
130132
}
@@ -143,10 +145,10 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
143145
}
144146

145147
@Override
146-
public InputStream createDecompressor(final InputStream readFrom) {
148+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
147149
try {
148150
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
149-
CompressorStreamFactory.PACK200, bufferStream(readFrom));
151+
CompressorStreamFactory.PACK200, bufferStream(readFrom), decompressConcatenated);
150152
} catch (CompressorException e) {
151153
throw new MetafactureException(e);
152154
}
@@ -165,22 +167,32 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
165167
}
166168

167169
@Override
168-
public InputStream createDecompressor(final InputStream readFrom) {
170+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
169171
try {
170172
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
171-
CompressorStreamFactory.XZ, bufferStream(readFrom));
173+
CompressorStreamFactory.XZ, bufferStream(readFrom), decompressConcatenated);
172174
} catch (CompressorException e) {
173175
throw new MetafactureException(e);
174176
}
175177
}
176178
};
177179

178-
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY = new CompressorStreamFactory();
180+
public static final boolean DEFAULT_DECOMPRESS_CONCATENATED = false;
181+
182+
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY_DECOMPRESS_CONCATENATED = new CompressorStreamFactory(true);
183+
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY_NO_DECOMPRESS_CONCATENATED = new CompressorStreamFactory(false);
184+
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY = DEFAULT_DECOMPRESS_CONCATENATED ?
185+
APACHE_COMPRESSOR_FACTORY_DECOMPRESS_CONCATENATED : APACHE_COMPRESSOR_FACTORY_NO_DECOMPRESS_CONCATENATED;
186+
179187
private static final int BUFFER_SIZE = 8 * 1024 * 1024;
180188

181189
public abstract OutputStream createCompressor(final OutputStream writeTo, final String fileName);
182190

183-
public abstract InputStream createDecompressor(final InputStream readFrom);
191+
public abstract InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated);
192+
193+
public InputStream createDecompressor(final InputStream readFrom) {
194+
return createDecompressor(readFrom, DEFAULT_DECOMPRESS_CONCATENATED);
195+
}
184196

185197
private static OutputStream bufferStream(final OutputStream stream) {
186198
if (stream instanceof BufferedOutputStream) {

metafacture-io/src/main/java/org/metafacture/io/FileOpener.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ public final class FileOpener
4646

4747
private String encoding = "UTF-8";
4848
private FileCompression compression = FileCompression.AUTO;
49+
private boolean decompressConcatenated = FileCompression.DEFAULT_DECOMPRESS_CONCATENATED;
4950

5051
/**
5152
* Returns the encoding used to open the resource.
@@ -78,12 +79,20 @@ public void setCompression(final String compression) {
7879
setCompression(FileCompression.valueOf(compression.toUpperCase()));
7980
}
8081

82+
public boolean getDecompressConcatenated() {
83+
return decompressConcatenated;
84+
}
85+
86+
public void setDecompressConcatenated(final boolean decompressConcatenated) {
87+
this.decompressConcatenated = decompressConcatenated;
88+
}
89+
8190
@Override
8291
public void process(final String file) {
8392
try {
8493
final InputStream fileStream = new FileInputStream(file);
8594
try {
86-
final InputStream decompressor = compression.createDecompressor(fileStream);
95+
final InputStream decompressor = compression.createDecompressor(fileStream, decompressConcatenated);
8796
try {
8897

8998
final Reader reader = new InputStreamReader(new BOMInputStream(

metafacture-io/src/test/java/org/metafacture/io/FileOpenerCompressionTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ public FileOpenerCompressionTest(final String resourcePath,
7676
public static Iterable<Object[]> data() {
7777
return Arrays.asList(new Object[][] {
7878
{ "compressed.txt", FileCompression.AUTO },
79+
{ "compressed.txt.bgzf", FileCompression.AUTO },
7980
{ "compressed.txt.bz2", FileCompression.AUTO },
8081
{ "compressed.txt.bzip2", FileCompression.AUTO },
8182
{ "compressed.txt.gz", FileCompression.AUTO },
@@ -84,6 +85,7 @@ public static Iterable<Object[]> data() {
8485
{ "compressed.txt", FileCompression.NONE },
8586
{ "compressed.txt.bz2", FileCompression.BZIP2 },
8687
{ "compressed.txt.bzip2", FileCompression.BZIP2 },
88+
{ "compressed.txt.bgzf", FileCompression.GZIP },
8789
{ "compressed.txt.gz", FileCompression.GZIP },
8890
{ "compressed.txt.gzip", FileCompression.GZIP },
8991
{ "compressed.txt.xz", FileCompression.XZ },

metafacture-io/src/test/java/org/metafacture/io/FileOpenerTest.java

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,21 @@
1616
package org.metafacture.io;
1717

1818
import static org.junit.Assert.assertEquals;
19+
import static org.junit.Assert.assertTrue;
1920
import static org.junit.Assume.assumeFalse;
2021
import static org.mockito.Mockito.verify;
2122

2223
import java.io.File;
2324
import java.io.FileOutputStream;
2425
import java.io.IOException;
26+
import java.io.InputStream;
27+
import java.io.InputStreamReader;
2528
import java.io.OutputStream;
2629
import java.io.Reader;
2730
import java.nio.charset.Charset;
2831
import java.nio.charset.StandardCharsets;
32+
import java.nio.file.Files;
33+
import java.nio.file.StandardCopyOption;
2934

3035
import org.junit.Rule;
3136
import org.junit.Test;
@@ -77,6 +82,43 @@ public void testUtf8IsDefaultEncoding() throws IOException {
7782
assertEquals(DATA, ResourceUtil.readAll(processedObject.getValue()));
7883
}
7984

85+
@Test
86+
public void testNoDecompressConcatenated() throws IOException {
87+
testDecompressConcatenated(false);
88+
}
89+
90+
@Test
91+
public void testDecompressConcatenated() throws IOException {
92+
testDecompressConcatenated(true);
93+
}
94+
95+
private void testDecompressConcatenated(final boolean decompressConcatenated) throws IOException {
96+
final int maxBytes = (int) Math.pow(2, 16); // BGZF max compressed block size
97+
final StringBuilder sb = new StringBuilder();
98+
99+
try (InputStreamReader r = new InputStreamReader(getClass().getResourceAsStream("compressed.txt"))) {
100+
final String data = ResourceUtil.readAll(r);
101+
for (int i = 0; i < 1525; i++) {
102+
sb.append(data).append("\n");
103+
}
104+
}
105+
106+
final String data = sb.toString();
107+
assertTrue(data.length() + " > " + maxBytes, data.length() > maxBytes);
108+
109+
final File testFile = copyResourceToTempFile("compressed-large.txt.bgzf");
110+
111+
final FileOpener opener = new FileOpener();
112+
opener.setDecompressConcatenated(decompressConcatenated);
113+
opener.setReceiver(receiver);
114+
opener.process(testFile.getAbsolutePath());
115+
opener.closeStream();
116+
117+
verify(receiver).process(processedObject.capture());
118+
assertEquals(decompressConcatenated ? data : data.substring(0, maxBytes),
119+
ResourceUtil.readAll(processedObject.getValue()));
120+
}
121+
80122
private File createTestFile() throws IOException {
81123
final File file = tempFolder.newFile();
82124
try (OutputStream stream = new FileOutputStream(file)) {
@@ -85,4 +127,12 @@ private File createTestFile() throws IOException {
85127
return file;
86128
}
87129

130+
private File copyResourceToTempFile(final String resourcePath) throws IOException {
131+
final File file = tempFolder.newFile();
132+
try (InputStream in = getClass().getResourceAsStream(resourcePath)) {
133+
Files.copy(in, file.toPath(), StandardCopyOption.REPLACE_EXISTING);
134+
}
135+
return file;
136+
}
137+
88138
}
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)