Skip to content

Commit 07db26d

Browse files
committed
Merge branch '350-subfields' into oersi
See #350
2 parents a66bf90 + 6916578 commit 07db26d

File tree

11 files changed

+191
-20
lines changed

11 files changed

+191
-20
lines changed

.github/workflows/build.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Continuous-integration workflow: runs the Gradle `check` task on every push.
name: Build

on: push

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      # Fetch the repository contents.
      - uses: actions/checkout@v2
      # Provide the Java 8 toolchain used for the build.
      - name: Set up JDK 1.8
        uses: actions/setup-java@v1
        with:
          java-version: 1.8
      # Compile and run all verification tasks.
      - name: Build with gradle
        run: ./gradlew check

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
![Metafacture](https://raw.github.com/wiki/metafacture/metafacture-core/img/metafacture.png)
22

3-
[![Build status](https://travis-ci.org/metafacture/metafacture-core.svg?branch=master)](https://travis-ci.org/metafacture/metafacture-core) [![Quality Gate](https://sonarcloud.io/api/project_badges/measure?project=org.metafacture:metafacture-core&metric=alert_status)](https://sonarcloud.io/dashboard/index/org.metafacture:metafacture-core)
3+
[![Build](https://github.com/metafacture/metafacture-core/workflows/Build/badge.svg?branch=master)](https://github.com/metafacture/metafacture-core/actions?query=workflow%3ABuild) [![Quality Gate](https://sonarcloud.io/api/project_badges/measure?project=org.metafacture:metafacture-core&metric=alert_status)](https://sonarcloud.io/dashboard/index/org.metafacture:metafacture-core)
44

55
Metafacture is a toolkit for processing semi-structured data with a focus on library metadata. It provides a versatile set of tools for reading, writing and transforming data. Metafacture can be used as a stand-alone application or as a Java library in other applications. The name Metafacture is a portmanteau of the words *meta* data and manu*facture*.
66

metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020 Fabian Steeg, hbz
2+
* Copyright 2020, 2021 Fabian Steeg, hbz
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -17,6 +17,11 @@
1717

1818
import java.io.IOException;
1919
import java.io.Reader;
20+
import java.io.UnsupportedEncodingException;
21+
import java.net.URLDecoder;
22+
import java.nio.charset.StandardCharsets;
23+
import java.util.HashMap;
24+
import java.util.Map;
2025
import java.util.UUID;
2126

2227
import org.apache.commons.io.IOUtils;
@@ -38,12 +43,28 @@
3843
* @author Fabian Steeg (fsteeg)
3944
*
4045
*/
41-
@Description("Decode HTML to metadata events")
46+
@Description("Decode HTML to metadata events. The attrValsAsSubfields option can be used to override "
47+
+ "the default attribute values to be used as subfields (e.g. by default "
48+
+ "`link rel=\"canonical\" href=\"http://example.org\"` becomes `link.canonical`). "
49+
+ "It expects an HTTP-style query string specifying as key the attributes whose value should "
50+
+ "be used as a subfield, and as value the attribute whose value should be the subfield value, "
51+
+ "e.g. the default contains `link.rel=href`. To use the HTML element text as the value "
52+
+ "(instead of another attribute), omit the value of the query-string key-value pair, "
53+
+ "e.g. `title.lang`. To add to the defaults, instead of replacing them, start with an `&`, "
54+
+ "e.g. `&h3.class`")
4255
@In(Reader.class)
4356
@Out(StreamReceiver.class)
4457
@FluxCommand("decode-html")
4558
public class HtmlDecoder extends DefaultObjectPipe<Reader, StreamReceiver> {
4659

60+
/**
 * Built-in attribute-to-subfield mapping in query-string form: each key is
 * {@code element.attribute}, each value names the attribute that supplies the
 * subfield value.
 */
private static final String DEFAULT_ATTR_VALS_AS_SUBFIELDS =
        "meta.name=content&meta.property=content&link.rel=href&a.rel=href";

/** Parsed form of the currently active attribute-to-subfield mapping. */
private Map<String, String> attrValsAsSubfields;

/**
 * Creates a decoder that starts out with the default attribute-to-subfield mapping.
 */
public HtmlDecoder() {
    setAttrValsAsSubfields(DEFAULT_ATTR_VALS_AS_SUBFIELDS);
}
67+
4768
@Override
4869
public void process(final Reader reader) {
4970
try {
@@ -62,6 +83,7 @@ private void process(Element parent, StreamReceiver receiver) {
6283
receiver.startEntity(element.nodeName());
6384
Attributes attributes = element.attributes();
6485
for (Attribute attribute : attributes) {
86+
handleAttributeValuesAsSubfields(receiver, element, attributes, attribute);
6587
receiver.literal(attribute.getKey(), attribute.getValue());
6688
}
6789
if (element.children().isEmpty()) {
@@ -75,4 +97,35 @@ private void process(Element parent, StreamReceiver receiver) {
7597
receiver.endEntity();
7698
}
7799
}
100+
101+
private void handleAttributeValuesAsSubfields(StreamReceiver receiver, Element element,
102+
Attributes attributes, Attribute attribute) {
103+
String fullFieldKey = element.nodeName() + "." + attribute.getKey();
104+
if (attrValsAsSubfields.containsKey(fullFieldKey)) {
105+
String configValue = attrValsAsSubfields.get(fullFieldKey);
106+
if (configValue.trim().isEmpty()) {
107+
receiver.literal(attribute.getValue(), element.text().trim());
108+
} else {
109+
String value = attributes.get(configValue);
110+
receiver.literal(attribute.getValue(), value);
111+
}
112+
}
113+
}
114+
115+
public void setAttrValsAsSubfields(String mapString) {
116+
this.attrValsAsSubfields = new HashMap<String, String>();
117+
String input = mapString.startsWith("&") ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString
118+
: mapString;
119+
for (String nameValuePair : input.split("&")) {
120+
String[] nameValue = nameValuePair.split("=");
121+
try {
122+
String utf8 = StandardCharsets.UTF_8.name();
123+
String key = URLDecoder.decode(nameValue[0], utf8);
124+
String val = nameValue.length > 1 ? URLDecoder.decode(nameValue[1], utf8) : "";
125+
attrValsAsSubfields.put(key, val);
126+
} catch (UnsupportedEncodingException e) {
127+
e.printStackTrace();
128+
}
129+
}
130+
}
78131
}

metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2020 Fabian Steeg, hbz
2+
* Copyright 2020, 2021 Fabian Steeg, hbz
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -97,4 +97,35 @@ public void htmlScriptElementData() {
9797
ordered.verify(receiver, times(4)).endEntity();
9898
}
9999

100+
@Test
101+
public void htmlAttributesAsSubfieldsDefault() {
102+
htmlDecoder.process(new StringReader("<meta name=\"language\" content=\"DE\"/>"));
103+
final InOrder ordered = inOrder(receiver);
104+
ordered.verify(receiver).startEntity("meta");
105+
ordered.verify(receiver).literal("language", "DE");
106+
ordered.verify(receiver, times(4)).endEntity();
107+
}
108+
109+
@Test
110+
public void htmlAttributesAsSubfieldsCustom() {
111+
htmlDecoder.setAttrValsAsSubfields("mods:url.access");
112+
htmlDecoder.process(new StringReader("<mods:url access=\"preview\">file:///img.png</mods:url>"));
113+
final InOrder ordered = inOrder(receiver);
114+
ordered.verify(receiver).startEntity("mods:url");
115+
ordered.verify(receiver).literal("preview", "file:///img.png");
116+
ordered.verify(receiver, times(3)).endEntity();
117+
}
118+
119+
@Test
120+
public void htmlAttributesAsSubfieldsDefaultPlusCustom() {
121+
htmlDecoder.setAttrValsAsSubfields("&mods:url.access");
122+
htmlDecoder.process(new StringReader("<meta name=\"language\" content=\"DE\"/>"
123+
+ "<mods:url access=\"preview\">file:///img.png</mods:url>"));
124+
final InOrder ordered = inOrder(receiver);
125+
ordered.verify(receiver).startEntity("meta");
126+
ordered.verify(receiver).literal("language", "DE");
127+
ordered.verify(receiver).startEntity("mods:url");
128+
ordered.verify(receiver).literal("preview", "file:///img.png");
129+
ordered.verify(receiver, times(3)).endEntity();
130+
}
100131
}

metafacture-io/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ dependencies {
2121
api project(':metafacture-framework')
2222
implementation project(':metafacture-commons')
2323
implementation 'commons-io:commons-io:2.5'
24-
implementation 'org.apache.commons:commons-compress:1.12'
24+
implementation 'org.apache.commons:commons-compress:1.20'
2525
runtimeOnly 'org.tukaani:xz:1.6'
2626
testImplementation 'junit:junit:4.12'
2727
testImplementation 'org.mockito:mockito-core:2.5.5'

metafacture-io/src/main/java/org/metafacture/io/FileCompression.java

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
4343
}
4444

4545
@Override
46-
public InputStream createDecompressor(final InputStream readFrom) {
46+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
4747
return new ProxyInputStream(readFrom) {
4848
//nothing to do
4949
};
@@ -77,12 +77,14 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
7777
}
7878

7979
@Override
80-
public InputStream createDecompressor(final InputStream readFrom) {
80+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
8181
final InputStream bufferedStream = bufferStream(readFrom);
8282
try {
83-
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(bufferedStream);
83+
return decompressConcatenated ?
84+
APACHE_COMPRESSOR_FACTORY_DECOMPRESS_CONCATENATED.createCompressorInputStream(bufferedStream) :
85+
APACHE_COMPRESSOR_FACTORY_NO_DECOMPRESS_CONCATENATED.createCompressorInputStream(bufferedStream);
8486
} catch (CompressorException e) {
85-
return NONE.createDecompressor(bufferedStream);
87+
return NONE.createDecompressor(bufferedStream, decompressConcatenated);
8688
}
8789
}
8890
},
@@ -99,10 +101,10 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
99101
}
100102

101103
@Override
102-
public InputStream createDecompressor(final InputStream readFrom) {
104+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
103105
try {
104106
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
105-
CompressorStreamFactory.BZIP2, bufferStream(readFrom));
107+
CompressorStreamFactory.BZIP2, bufferStream(readFrom), decompressConcatenated);
106108
} catch (CompressorException e) {
107109
throw new MetafactureException(e);
108110
}
@@ -121,10 +123,10 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
121123
}
122124

123125
@Override
124-
public InputStream createDecompressor(final InputStream readFrom) {
126+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
125127
try {
126128
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
127-
CompressorStreamFactory.GZIP, bufferStream(readFrom));
129+
CompressorStreamFactory.GZIP, bufferStream(readFrom), decompressConcatenated);
128130
} catch (CompressorException e) {
129131
throw new MetafactureException(e);
130132
}
@@ -143,10 +145,10 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
143145
}
144146

145147
@Override
146-
public InputStream createDecompressor(final InputStream readFrom) {
148+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
147149
try {
148150
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
149-
CompressorStreamFactory.PACK200, bufferStream(readFrom));
151+
CompressorStreamFactory.PACK200, bufferStream(readFrom), decompressConcatenated);
150152
} catch (CompressorException e) {
151153
throw new MetafactureException(e);
152154
}
@@ -165,22 +167,32 @@ public OutputStream createCompressor(final OutputStream writeTo, final String fi
165167
}
166168

167169
@Override
168-
public InputStream createDecompressor(final InputStream readFrom) {
170+
public InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated) {
169171
try {
170172
return APACHE_COMPRESSOR_FACTORY.createCompressorInputStream(
171-
CompressorStreamFactory.XZ, bufferStream(readFrom));
173+
CompressorStreamFactory.XZ, bufferStream(readFrom), decompressConcatenated);
172174
} catch (CompressorException e) {
173175
throw new MetafactureException(e);
174176
}
175177
}
176178
};
177179

178-
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY = new CompressorStreamFactory();
180+
public static final boolean DEFAULT_DECOMPRESS_CONCATENATED = false;
181+
182+
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY_DECOMPRESS_CONCATENATED = new CompressorStreamFactory(true);
183+
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY_NO_DECOMPRESS_CONCATENATED = new CompressorStreamFactory(false);
184+
private static final CompressorStreamFactory APACHE_COMPRESSOR_FACTORY = DEFAULT_DECOMPRESS_CONCATENATED ?
185+
APACHE_COMPRESSOR_FACTORY_DECOMPRESS_CONCATENATED : APACHE_COMPRESSOR_FACTORY_NO_DECOMPRESS_CONCATENATED;
186+
179187
private static final int BUFFER_SIZE = 8 * 1024 * 1024;
180188

181189
public abstract OutputStream createCompressor(final OutputStream writeTo, final String fileName);
182190

183-
public abstract InputStream createDecompressor(final InputStream readFrom);
191+
public abstract InputStream createDecompressor(final InputStream readFrom, final boolean decompressConcatenated);
192+
193+
public InputStream createDecompressor(final InputStream readFrom) {
194+
return createDecompressor(readFrom, DEFAULT_DECOMPRESS_CONCATENATED);
195+
}
184196

185197
private static OutputStream bufferStream(final OutputStream stream) {
186198
if (stream instanceof BufferedOutputStream) {

metafacture-io/src/main/java/org/metafacture/io/FileOpener.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ public final class FileOpener
4646

4747
private String encoding = "UTF-8";
4848
private FileCompression compression = FileCompression.AUTO;
49+
private boolean decompressConcatenated = FileCompression.DEFAULT_DECOMPRESS_CONCATENATED;
4950

5051
/**
5152
* Returns the encoding used to open the resource.
@@ -78,12 +79,20 @@ public void setCompression(final String compression) {
7879
setCompression(FileCompression.valueOf(compression.toUpperCase()));
7980
}
8081

82+
public boolean getDecompressConcatenated() {
83+
return decompressConcatenated;
84+
}
85+
86+
public void setDecompressConcatenated(final boolean decompressConcatenated) {
87+
this.decompressConcatenated = decompressConcatenated;
88+
}
89+
8190
@Override
8291
public void process(final String file) {
8392
try {
8493
final InputStream fileStream = new FileInputStream(file);
8594
try {
86-
final InputStream decompressor = compression.createDecompressor(fileStream);
95+
final InputStream decompressor = compression.createDecompressor(fileStream, decompressConcatenated);
8796
try {
8897

8998
final Reader reader = new InputStreamReader(new BOMInputStream(

metafacture-io/src/test/java/org/metafacture/io/FileOpenerCompressionTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ public FileOpenerCompressionTest(final String resourcePath,
7676
public static Iterable<Object[]> data() {
7777
return Arrays.asList(new Object[][] {
7878
{ "compressed.txt", FileCompression.AUTO },
79+
{ "compressed.txt.bgzf", FileCompression.AUTO },
7980
{ "compressed.txt.bz2", FileCompression.AUTO },
8081
{ "compressed.txt.bzip2", FileCompression.AUTO },
8182
{ "compressed.txt.gz", FileCompression.AUTO },
@@ -84,6 +85,7 @@ public static Iterable<Object[]> data() {
8485
{ "compressed.txt", FileCompression.NONE },
8586
{ "compressed.txt.bz2", FileCompression.BZIP2 },
8687
{ "compressed.txt.bzip2", FileCompression.BZIP2 },
88+
{ "compressed.txt.bgzf", FileCompression.GZIP },
8789
{ "compressed.txt.gz", FileCompression.GZIP },
8890
{ "compressed.txt.gzip", FileCompression.GZIP },
8991
{ "compressed.txt.xz", FileCompression.XZ },

metafacture-io/src/test/java/org/metafacture/io/FileOpenerTest.java

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,21 @@
1616
package org.metafacture.io;
1717

1818
import static org.junit.Assert.assertEquals;
19+
import static org.junit.Assert.assertTrue;
1920
import static org.junit.Assume.assumeFalse;
2021
import static org.mockito.Mockito.verify;
2122

2223
import java.io.File;
2324
import java.io.FileOutputStream;
2425
import java.io.IOException;
26+
import java.io.InputStream;
27+
import java.io.InputStreamReader;
2528
import java.io.OutputStream;
2629
import java.io.Reader;
2730
import java.nio.charset.Charset;
2831
import java.nio.charset.StandardCharsets;
32+
import java.nio.file.Files;
33+
import java.nio.file.StandardCopyOption;
2934

3035
import org.junit.Rule;
3136
import org.junit.Test;
@@ -77,6 +82,43 @@ public void testUtf8IsDefaultEncoding() throws IOException {
7782
assertEquals(DATA, ResourceUtil.readAll(processedObject.getValue()));
7883
}
7984

85+
@Test
86+
public void testNoDecompressConcatenated() throws IOException {
87+
testDecompressConcatenated(false);
88+
}
89+
90+
@Test
91+
public void testDecompressConcatenated() throws IOException {
92+
testDecompressConcatenated(true);
93+
}
94+
95+
private void testDecompressConcatenated(final boolean decompressConcatenated) throws IOException {
96+
final int maxBytes = (int) Math.pow(2, 16); // BGZF max compressed block size
97+
final StringBuilder sb = new StringBuilder();
98+
99+
try (InputStreamReader r = new InputStreamReader(getClass().getResourceAsStream("compressed.txt"))) {
100+
final String data = ResourceUtil.readAll(r);
101+
for (int i = 0; i < 1525; i++) {
102+
sb.append(data).append("\n");
103+
}
104+
}
105+
106+
final String data = sb.toString();
107+
assertTrue(data.length() + " > " + maxBytes, data.length() > maxBytes);
108+
109+
final File testFile = copyResourceToTempFile("compressed-large.txt.bgzf");
110+
111+
final FileOpener opener = new FileOpener();
112+
opener.setDecompressConcatenated(decompressConcatenated);
113+
opener.setReceiver(receiver);
114+
opener.process(testFile.getAbsolutePath());
115+
opener.closeStream();
116+
117+
verify(receiver).process(processedObject.capture());
118+
assertEquals(decompressConcatenated ? data : data.substring(0, maxBytes),
119+
ResourceUtil.readAll(processedObject.getValue()));
120+
}
121+
80122
private File createTestFile() throws IOException {
81123
final File file = tempFolder.newFile();
82124
try (OutputStream stream = new FileOutputStream(file)) {
@@ -85,4 +127,12 @@ private File createTestFile() throws IOException {
85127
return file;
86128
}
87129

130+
private File copyResourceToTempFile(final String resourcePath) throws IOException {
131+
final File file = tempFolder.newFile();
132+
try (InputStream in = getClass().getResourceAsStream(resourcePath)) {
133+
Files.copy(in, file.toPath(), StandardCopyOption.REPLACE_EXISTING);
134+
}
135+
return file;
136+
}
137+
88138
}
Binary file not shown.

0 commit comments

Comments
 (0)