Skip to content

Commit 93411e8

Browse files
committed
fixed #10
Extracted Formeta parser from FormetaDecoder.
1 parent 5670f12 commit 93411e8

File tree

13 files changed

+965
-782
lines changed

13 files changed

+965
-782
lines changed

src/main/java/org/culturegraph/mf/types/Formeta.java renamed to src/main/java/org/culturegraph/mf/formeta/Formeta.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16-
package org.culturegraph.mf.types;
16+
package org.culturegraph.mf.formeta;
1717

1818
/**
19-
* Constant definitions for the Metafor format.
19+
* Constant definitions for the Formeta format.
2020
*
2121
* @author Christoph Böhme
2222
*
@@ -38,4 +38,9 @@ public final class Formeta {
3838
private Formeta() {
3939
// No instances allowed
4040
}
41+
42+
public static boolean isWhitespace(final char ch) {
43+
return WHITESPACE.indexOf(ch) > -1;
44+
}
45+
4146
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Copyright 2013 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.formeta.parser;
17+
18+
import org.culturegraph.mf.framework.StreamReceiver;
19+
20+
/**
21+
* Interface for event emitters.
22+
*
23+
* @author Christoph Böhme
24+
*
25+
*/
26+
public interface Emitter {
27+
28+
void setReceiver(final StreamReceiver receiver);
29+
30+
void startGroup(final String name, final int nestingLevel);
31+
32+
void endGroup(final int nestingLevel);
33+
34+
void literal(final String name, final String value, final int nestingLevel);
35+
36+
}
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/*
2+
* Copyright 2013 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.formeta.parser;
17+
18+
import org.culturegraph.mf.exceptions.FormatException;
19+
20+
/**
21+
* A parser for the formeta metadata serialisation format.
22+
*
23+
* @author Christoph Böhme
24+
*
25+
*/
26+
public final class FormetaParser {
27+
28+
public static final int SNIPPET_SIZE = 20;
29+
public static final String SNIPPET_ELLIPSIS = "\u2026";
30+
public static final String POS_MARKER_LEFT = ">";
31+
public static final String POS_MARKER_RIGHT = "<";
32+
33+
private static final int BUFFER_SIZE = 1024 * 1024;
34+
35+
private char[] buffer = new char[BUFFER_SIZE];
36+
private final StructureParserContext structureParserContext = new StructureParserContext();
37+
38+
public void setEmitter(final Emitter emitter) {
39+
structureParserContext.setEmitter(emitter);
40+
}
41+
42+
public Emitter getEmitter() {
43+
return structureParserContext.getEmitter();
44+
}
45+
46+
public void parse(final String data) {
47+
assert structureParserContext.getEmitter() != null: "No emitter set";
48+
49+
// According to http://stackoverflow.com/a/11876086 it is faster to copy
50+
// the string into a char array than to use charAt():
51+
final int recordLen = data.length();
52+
if(recordLen > buffer.length) {
53+
buffer = new char[buffer.length * 2];
54+
}
55+
data.getChars(0, recordLen, buffer, 0);
56+
57+
structureParserContext.reset();
58+
StructureParserState state = StructureParserState.ITEM_NAME;
59+
int i = 0;
60+
try {
61+
for (; i < recordLen; ++i) {
62+
state = state.processChar(buffer[i], structureParserContext);
63+
}
64+
} catch (FormatException e) {
65+
final String errorMsg = "Parsing error at position "
66+
+ (i + 1) + ": "
67+
+ getErrorSnippet(data, i) + ", "
68+
+ e.getMessage();
69+
throw new FormatException(errorMsg, e);
70+
}
71+
try {
72+
state.endOfInput(structureParserContext);
73+
} catch (FormatException e) {
74+
throw new FormatException("Parsing error: " + e.getMessage(), e);
75+
}
76+
}
77+
78+
/**
79+
* Extracts a text snippet from the record for showing the position at
80+
* which an error occurred. The exact position additionally highlighted
81+
* with {@link POS_MARKER_LEFT} and {@link POS_MARKER_RIGHT}.
82+
*
83+
* @param record the record currently being parsed
84+
* @param pos the position at which the error occurred
85+
* @return a text snippet.
86+
*/
87+
private static String getErrorSnippet(final String record, final int pos) {
88+
final StringBuilder snippet = new StringBuilder();
89+
90+
final int start = pos - SNIPPET_SIZE / 2;
91+
if (start < 0) {
92+
snippet.append(record.substring(0, pos));
93+
} else {
94+
snippet.append(SNIPPET_ELLIPSIS);
95+
snippet.append(record.substring(start, pos));
96+
}
97+
98+
snippet.append(POS_MARKER_LEFT);
99+
snippet.append(record.charAt(pos));
100+
snippet.append(POS_MARKER_RIGHT);
101+
102+
if (pos + 1 < record.length()) {
103+
final int end = pos + SNIPPET_SIZE / 2;
104+
if (end > record.length()) {
105+
snippet.append(record.substring(pos + 1));
106+
} else {
107+
snippet.append(record.substring(pos + 1, end));
108+
snippet.append(SNIPPET_ELLIPSIS);
109+
}
110+
}
111+
112+
return snippet.toString();
113+
}
114+
115+
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
* Copyright 2013 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.formeta.parser;
17+
18+
import org.culturegraph.mf.exceptions.FormatException;
19+
import org.culturegraph.mf.framework.StreamReceiver;
20+
21+
/**
22+
* Emits full records.
23+
*
24+
* @author Christoph Böhme
25+
*
26+
*/
27+
public final class FullRecordEmitter implements Emitter {
28+
29+
private StreamReceiver receiver;
30+
31+
@Override
32+
public void setReceiver(final StreamReceiver receiver) {
33+
this.receiver = receiver;
34+
}
35+
36+
@Override
37+
public void startGroup(final String name, final int nestingLevel) {
38+
if (nestingLevel == 0) {
39+
receiver.startRecord(name);
40+
} else {
41+
receiver.startEntity(name);
42+
}
43+
}
44+
45+
@Override
46+
public void endGroup(final int nestingLevel) {
47+
if (nestingLevel == 0) {
48+
receiver.endRecord();
49+
} else {
50+
receiver.endEntity();
51+
}
52+
}
53+
54+
@Override
55+
public void literal(final String name, final String value, final int nestingLevel) {
56+
if (nestingLevel == 0) {
57+
throw new FormatException("literals may only appear in records");
58+
}
59+
receiver.literal(name, value);
60+
}
61+
62+
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Copyright 2013 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.formeta.parser;
17+
18+
import org.culturegraph.mf.framework.StreamReceiver;
19+
20+
/**
21+
* Emits partial records.
22+
*/
23+
public final class PartialRecordEmitter implements Emitter {
24+
25+
private StreamReceiver receiver;
26+
private String defaultName;
27+
28+
public PartialRecordEmitter() {
29+
this(null);
30+
}
31+
32+
public PartialRecordEmitter(final String defaultName) {
33+
this.defaultName = defaultName;
34+
}
35+
36+
@Override
37+
public void setReceiver(final StreamReceiver receiver) {
38+
this.receiver = receiver;
39+
}
40+
41+
@Override
42+
public void startGroup(final String name, final int nestingLevel) {
43+
if (defaultName != null && name.isEmpty()) {
44+
receiver.startEntity(defaultName);
45+
} else {
46+
receiver.startEntity(name);
47+
}
48+
}
49+
50+
@Override
51+
public void endGroup(final int nestingLevel) {
52+
receiver.endEntity();
53+
}
54+
55+
@Override
56+
public void literal(final String name, final String value, final int nestingLevel) {
57+
if (defaultName != null && name.isEmpty()) {
58+
receiver.literal(defaultName, value);
59+
} else {
60+
receiver.literal(name, value);
61+
}
62+
}
63+
64+
}

0 commit comments

Comments
 (0)