Skip to content

Commit 9ad842e

Browse files
committed
Adds reader for multiline formeta records
The `FormetaDecoder` processes only individual records. If the `LineReader` or `RecordReader` modules are used for splitting an input stream consisting of a sequence of formeta records, this is quite inconvenient, as records either have to be on a single line or must be separated by a record separator character. As such a character is not defined for the formeta format, this is not a good solution. However, the structure of formeta makes it easy to recognise the end of a record without having to parse the record properly. The new module `FormetaRecordsReader` implements such a splitter. It splits a sequence of formeta records between each record.
1 parent 6d9d79b commit 9ad842e

File tree

3 files changed

+198
-1
lines changed

3 files changed

+198
-1
lines changed
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
* Copyright 2014 Christoph Böhme
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
5+
* this file except in compliance with the License. You may obtain a copy of the
6+
* License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12+
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13+
* License for the specific language governing permissions and limitations under
14+
* the License.
15+
*/
16+
package org.culturegraph.mf.stream.converter;
17+
18+
import java.io.IOException;
19+
import java.io.Reader;
20+
21+
import org.culturegraph.mf.exceptions.MetafactureException;
22+
import org.culturegraph.mf.formeta.Formeta;
23+
import org.culturegraph.mf.framework.DefaultObjectPipe;
24+
import org.culturegraph.mf.framework.ObjectReceiver;
25+
import org.culturegraph.mf.framework.annotations.Description;
26+
import org.culturegraph.mf.framework.annotations.In;
27+
import org.culturegraph.mf.framework.annotations.Out;
28+
29+
/**
30+
* Reads a stream of formeta data and splits between each top-level element.
31+
*
32+
* @author Christoph Böhme
33+
*
34+
*/
35+
@In(Reader.class)
36+
@Out(String.class)
37+
@Description("Reads a stream of formeta data and splits between each top-level element")
38+
public final class FormetaRecordsReader extends
39+
DefaultObjectPipe<Reader, ObjectReceiver<String>> {
40+
41+
private static final int BUFFER_SIZE = 1024 * 1024 * 16;
42+
43+
private final StringBuilder builder = new StringBuilder();
44+
private final char[] buffer = new char[BUFFER_SIZE];
45+
46+
@Override
47+
public void process(final Reader reader) {
48+
assert !isClosed();
49+
50+
try {
51+
boolean readSomething = false;
52+
boolean inQuotedText = false;
53+
int groupLevel = 0;
54+
int size;
55+
while ((size = reader.read(buffer)) != -1) {
56+
readSomething = true;
57+
int offset = 0;
58+
for (int i = 0; i < size; ++i) {
59+
switch (buffer[i]) {
60+
case Formeta.ESCAPE_CHAR:
61+
i += 1; // Skip next character
62+
break;
63+
case Formeta.GROUP_START:
64+
if (!inQuotedText) {
65+
groupLevel += 1;
66+
}
67+
break;
68+
case Formeta.GROUP_END:
69+
if (!inQuotedText) {
70+
groupLevel -= 1;
71+
}
72+
// Fall through
73+
case Formeta.ITEM_SEPARATOR:
74+
if (!inQuotedText && groupLevel == 0) {
75+
builder.append(buffer, offset, i - offset + 1);
76+
offset = i + 1;
77+
emitRecord();
78+
}
79+
break;
80+
case Formeta.QUOT_CHAR:
81+
inQuotedText = !inQuotedText;
82+
break;
83+
}
84+
}
85+
builder.append(buffer, offset, size - offset);
86+
}
87+
if (readSomething) {
88+
emitRecord();
89+
}
90+
91+
} catch (final IOException e) {
92+
throw new MetafactureException(e);
93+
}
94+
}
95+
96+
private void emitRecord() {
97+
final String record = builder.toString();
98+
getReceiver().process(record);
99+
builder.delete(0, builder.length());
100+
}
101+
102+
}

src/main/resources/flux-commands.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jscript org.culturegraph.mf.stream.pipe.JScriptObjectPipe
2222

2323
as-lines org.culturegraph.mf.stream.converter.LineReader
2424
as-records org.culturegraph.mf.stream.converter.RecordReader
25-
25+
as-formeta-records org.culturegraph.mf.stream.converter.FormetaRecordsReader
2626
# Decoders:
2727
decode-pica org.culturegraph.mf.stream.converter.bib.PicaDecoder
2828
decode-mab org.culturegraph.mf.stream.converter.bib.MabDecoder
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
package org.culturegraph.mf.stream.converter;
2+
3+
import static org.mockito.Mockito.verify;
4+
import static org.mockito.Mockito.verifyNoMoreInteractions;
5+
6+
import java.io.StringReader;
7+
8+
import org.culturegraph.mf.framework.ObjectReceiver;
9+
import org.junit.After;
10+
import org.junit.Before;
11+
import org.junit.Test;
12+
import org.mockito.Mock;
13+
import org.mockito.MockitoAnnotations;
14+
15+
public class FormetaRecordsReaderTest {
16+
17+
private static String SINGLE_RECORD = "l: v";
18+
19+
private static String RECORD_LITERAL = " l: v,";
20+
private static String RECORD_GROUP = " r{l: v}";
21+
private static String RECORD_NESTED_GROUP = " r{ e { l: v } }";
22+
private static String RECORD_QUOTED_LITERAL = " 'l x': v,";
23+
private static String RECORD_LEFT_BRACE_IN_QUOTES = " '{': l,";
24+
private static String RECORD_RIGHT_BRACE_IN_QUOTES = " r{ l: '}' }";
25+
private static String RECORD_COLON_IN_QUOTES = " ':': v,";
26+
private static String RECORD_COMMA_IN_QUOTES = " l: ',v:v',";
27+
private static String RECORD_ESCAPED_LEFT_BRACE = " \\{: v,";
28+
private static String RECORD_ESCAPED_RIGHT_BRACE = " r{ l: \\} }";
29+
private static String RECORD_ESCAPED_COLON = " \\:: v,";
30+
private static String RECORD_ESCAPED_COMMA = " l: \\,v\\:v,";
31+
private static String RECORD_ESCAPED_QUOTE = " '\\',': v";
32+
33+
private FormetaRecordsReader formetaRecordsReader;
34+
35+
@Mock
36+
private ObjectReceiver<String> receiver;
37+
38+
@Before
39+
public void setup() {
40+
MockitoAnnotations.initMocks(this);
41+
formetaRecordsReader = new FormetaRecordsReader();
42+
formetaRecordsReader.setReceiver(receiver);
43+
}
44+
45+
@After
46+
public void cleanup() {
47+
formetaRecordsReader.closeStream();
48+
}
49+
50+
@Test
51+
public void shouldProcessSingleRecord() {
52+
final StringReader reader = new StringReader(SINGLE_RECORD);
53+
54+
formetaRecordsReader.process(reader);
55+
56+
verify(receiver).process(SINGLE_RECORD);
57+
verifyNoMoreInteractions(receiver);
58+
}
59+
60+
@Test
61+
public void shouldSplitBetweenTopLevelElements() {
62+
final String records = RECORD_LITERAL +
63+
RECORD_GROUP +
64+
RECORD_NESTED_GROUP +
65+
RECORD_QUOTED_LITERAL +
66+
RECORD_LEFT_BRACE_IN_QUOTES +
67+
RECORD_RIGHT_BRACE_IN_QUOTES +
68+
RECORD_COLON_IN_QUOTES +
69+
RECORD_COMMA_IN_QUOTES +
70+
RECORD_ESCAPED_LEFT_BRACE +
71+
RECORD_ESCAPED_RIGHT_BRACE +
72+
RECORD_ESCAPED_COLON +
73+
RECORD_ESCAPED_COMMA +
74+
RECORD_ESCAPED_QUOTE;
75+
76+
final StringReader reader = new StringReader(records);
77+
78+
formetaRecordsReader.process(reader);
79+
80+
verify(receiver).process(RECORD_LITERAL);
81+
verify(receiver).process(RECORD_GROUP);
82+
verify(receiver).process(RECORD_NESTED_GROUP);
83+
verify(receiver).process(RECORD_QUOTED_LITERAL);
84+
verify(receiver).process(RECORD_LEFT_BRACE_IN_QUOTES);
85+
verify(receiver).process(RECORD_RIGHT_BRACE_IN_QUOTES);
86+
verify(receiver).process(RECORD_COLON_IN_QUOTES);
87+
verify(receiver).process(RECORD_COMMA_IN_QUOTES);
88+
verify(receiver).process(RECORD_ESCAPED_LEFT_BRACE);
89+
verify(receiver).process(RECORD_ESCAPED_RIGHT_BRACE);
90+
verify(receiver).process(RECORD_ESCAPED_COLON);
91+
verify(receiver).process(RECORD_ESCAPED_COMMA);
92+
verify(receiver).process(RECORD_ESCAPED_QUOTE);
93+
verifyNoMoreInteractions(receiver);
94+
}
95+
}

0 commit comments

Comments
 (0)