Skip to content

Commit 3c75b41

Browse files
committed
Re-implemented PicaDecoder based on a state machine.
The old PicaDecoder used regular expressions to parse PICA+ records. This led to two problems: * Errors in the data resulted in exceptions which did not refer to the portion of the data that caused the problem (e.g. a character index) * Due to the use of String.substring() for extracting data from the record, the full record was kept in memory (see issue #51) The new PicaDecoder was written to solve these problems. The first one was addressed by constructing the parser so that it only fails in two clearly defined situations (missing id field and unexpected end of record). The second one was solved by copying the parsed data portions into new strings. In addition to the problems listed above, the following issues were addressed: * #109 -- removed support for static usages of the decoder * #112 -- removed support for appendControlSubField. If Metamorph is extended to pass data through (issue #107), this functionality can easily be implemented in a script. It is also not clear how widely it is used at all. While support for control subfields has been removed, the new decoder introduces a range of new options: * ignore missing id -- do not fail on missing ids but use an empty string as record id * skip empty fields -- do not output fields without subfields or with empty subfields only (i.e. subfields without name and value) * fix unexpected end of record -- if a record does not end with a field delimiter, one will be automatically added. * normalize UTF8 -- automatically performs UTF8 normalization of values The unit tests have been rewritten to match the new options and to be more useful for debugging.
1 parent b984e4c commit 3c75b41

File tree

8 files changed

+701
-182
lines changed

8 files changed

+701
-182
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright 2013 Christoph Böhme
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.converter.bib;
17+
18+
/**
19+
* Useful constants for PICA+
20+
*
21+
* @author Christoph Böhme
22+
*
23+
*/
24+
final class PicaConstants {

	/** Character U+001F, used to introduce each subfield inside a field. */
	public static final char SUBFIELD_DELIMITER = '\u001f';

	/** Character U+001E, used to terminate each field of a record. */
	public static final char FIELD_DELIMITER = '\u001e';

	/**
	 * Pure constant holder; the private constructor prevents instantiation.
	 */
	private PicaConstants() {
		// No instances allowed
	}

}
Lines changed: 142 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2013 Deutsche Nationalbibliothek
2+
* Copyright 2013 Christoph Böhme
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -15,11 +15,6 @@
1515
*/
1616
package org.culturegraph.mf.stream.converter.bib;
1717

18-
import java.text.Normalizer;
19-
import java.text.Normalizer.Form;
20-
import java.util.regex.Matcher;
21-
import java.util.regex.Pattern;
22-
2318
import org.culturegraph.mf.exceptions.FormatException;
2419
import org.culturegraph.mf.framework.DefaultObjectPipe;
2520
import org.culturegraph.mf.framework.StreamReceiver;
@@ -29,101 +24,172 @@
2924

3025

3126
/**
32-
* Parses a raw Picaplus stream (utf8 encoding assumed). Events are handled by a
33-
* {@link StreamReceiver}.
27+
* Parses a PICA+ record with UTF8 encoding assumed.
28+
*
29+
* For each field in the stream the module calls:
30+
* <ol>
31+
* <li>receiver.startEntity</li>
32+
* <li>receiver.literal for each subfield of the field</li>
33+
* <li>receiver.endEntity</li>
34+
* </ol>
35+
*
36+
* Spaces in the field name are not included in the entity name.
37+
*
38+
* Empty subfields are skipped. For instance, processing the following input
39+
* would NOT produce an empty literal: 003@ \u001f\u001e
3440
*
35-
* @see StreamReceiver
41+
* If {@code ignoreMissingIdn} is false and field 003@$0 is not found in the
42+
* record a {@link MissingIdException} is thrown.
3643
*
37-
* @author Markus Michael Geipel, Christoph Böhme
44+
* @author Christoph Böhme
3845
*
3946
*/
40-
@Description("Parses a raw Picaplus stream (utf8 encoding assumed).")
47+
@Description("Parses a PICA+ record with UTF8 encoding assumed.")
4148
@In(String.class)
4249
@Out(StreamReceiver.class)
43-
public final class PicaDecoder
50+
public final class PicaDecoder
4451
extends DefaultObjectPipe<String, StreamReceiver> {
4552

46-
private static final String FIELD_DELIMITER = "\u001e";
47-
private static final String SUB_DELIMITER = "\u001f";
48-
private static final Pattern FIELD_PATTERN = Pattern.compile(
49-
FIELD_DELIMITER, Pattern.LITERAL);
50-
private static final Pattern SUBFIELD_PATTERN = Pattern.compile(
51-
SUB_DELIMITER, Pattern.LITERAL);
52-
private static final String ID_PATTERN_STRING = FIELD_DELIMITER + "003@ "
53-
+ SUB_DELIMITER + "0(.*?)" + FIELD_DELIMITER;
54-
private static final Pattern ID_PATTERN = Pattern
55-
.compile(ID_PATTERN_STRING);
56-
private static boolean appendControlSubField = true;
53+
private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_DELIMITER, '0'};
54+
55+
private static final int BUFFER_SIZE = 1024 * 1024;
56+
57+
private final StringBuilder idBuilder = new StringBuilder();
58+
private final PicaParserContext parserContext = new PicaParserContext();
59+
60+
private char[] buffer = new char[BUFFER_SIZE];
61+
private int recordLen;
62+
63+
private boolean ignoreMissingIdn;
64+
private boolean fixUnexpectedEOR;
5765

58-
/**
59-
* For each field in the stream the method calls:
60-
* <ol>
61-
* <li>receiver.startEntity</li>
62-
* <li>receiver.literal for each subfield of the field</li>
63-
* <li>receiver.endEntity</li>
64-
* </ol>
65-
* Fields without any subfield will be skipped.<br>
66-
* <strong>Special handling of subfield 'S':</strong> the code of
67-
* "control subfields" (subfield name='S') will be appended to the
68-
* fieldName. E.g.: 041A $Sa would be mapped to the fieldName 041Aa
69-
*
70-
* @param record
71-
*/
66+
public void setIgnoreMissingIdn(final boolean ignoreMissingIdn) {
67+
this.ignoreMissingIdn = ignoreMissingIdn;
68+
}
69+
70+
public boolean getIgnoreMissingIdn() {
71+
return ignoreMissingIdn;
72+
}
73+
74+
public void setFixUnexpectedEOR(final boolean fixUnexpectedEOR) {
75+
this.fixUnexpectedEOR = fixUnexpectedEOR;
76+
}
77+
78+
public boolean getFixUnexpectedEOR() {
79+
return fixUnexpectedEOR;
80+
}
81+
82+
public void setNormalizeUTF8(final boolean normalizeUTF8) {
83+
parserContext.setNormalizeUTF8(normalizeUTF8);
84+
}
85+
86+
public boolean getNormalizeUTF8() {
87+
return parserContext.getNormalizeUTF8();
88+
}
89+
90+
public void setSkipEmptyFields(final boolean skipEmptyFields) {
91+
parserContext.setSkipEmptyFields(skipEmptyFields);
92+
}
93+
94+
public boolean getSkipEmptyFields() {
95+
return parserContext.getSkipEmptyFields();
96+
}
97+
7298
@Override
7399
public void process(final String record) {
74100
assert !isClosed();
75-
process(record, getReceiver());
76-
}
101+
102+
copyToBuffer(record);
103+
104+
if (recordIsEmpty()) {
105+
return;
106+
}
107+
108+
String id = extractRecordId();
109+
if (id == null) {
110+
if (!ignoreMissingIdn) {
111+
throw new MissingIdException("Record has no id");
112+
}
113+
id = "";
114+
}
115+
getReceiver().startRecord(id);
77116

78-
public static void setAppendControlSubField(final boolean appendControlSubField) {
79-
PicaDecoder.appendControlSubField = appendControlSubField;
117+
PicaParserState state = PicaParserState.FIELD_NAME;
118+
for (int i = 0; i < recordLen; ++i) {
119+
state = state.parseChar(buffer[i], parserContext);
120+
}
121+
if (state != PicaParserState.FIELD_NAME || parserContext.hasUnprocessedText()) {
122+
if (fixUnexpectedEOR) {
123+
state = state.parseChar(PicaConstants.FIELD_DELIMITER, parserContext);
124+
assert state == PicaParserState.FIELD_NAME;
125+
assert !parserContext.hasUnprocessedText();
126+
} else {
127+
throw new FormatException("Unexpected end of record");
128+
}
129+
}
130+
131+
getReceiver().endRecord();
132+
}
133+
134+
@Override
135+
protected void onSetReceiver() {
136+
parserContext.setReceiver(getReceiver());
137+
}
138+
139+
@Override
140+
protected void onResetStream() {
141+
parserContext.reset();
80142
}
81143

82-
public static String extractIdFromRecord(final String record) {
83-
final Matcher idMatcher = ID_PATTERN.matcher(record);
84-
if (idMatcher.find()) {
85-
return idMatcher.group(1);
144+
private void copyToBuffer(final String record) {
145+
recordLen = record.length();
146+
if(recordLen > buffer.length) {
147+
buffer = new char[buffer.length * 2];
86148
}
87-
throw new MissingIdException(record);
149+
record.getChars(0, recordLen, buffer, 0);
88150
}
89151

90-
public static void process(final String rawRecord, final StreamReceiver receiver) {
91-
if (rawRecord.trim().isEmpty()) {
92-
return;
152+
private boolean recordIsEmpty() {
153+
for (int i = 0; i < recordLen; ++i) {
154+
if (buffer[i] != ' ' && buffer[i] != '\t') {
155+
return false;
156+
}
93157
}
158+
return true;
159+
}
160+
161+
private String extractRecordId() {
162+
idBuilder.setLength(0);
94163

95-
final String record = Normalizer.normalize(rawRecord, Form.NFC);
96-
try {
97-
receiver.startRecord(extractIdFromRecord(record));
98-
99-
for (String field : FIELD_PATTERN.split(record)) {
100-
final String[] subfields = SUBFIELD_PATTERN.split(field);
101-
if (subfields.length > 1) {
102-
final String fieldName;
103-
final int firstSubfield;
104-
if (subfields[1].charAt(0) == 'S' && appendControlSubField ) {
105-
fieldName = subfields[0].trim() + subfields[1].charAt(1);
106-
firstSubfield = 2;
164+
int fieldPos = 0;
165+
boolean skip = false;
166+
for (int i = 0; i < recordLen; ++i) {
167+
if (buffer[i] == PicaConstants.FIELD_DELIMITER) {
168+
if (idBuilder.length() > 0) {
169+
return idBuilder.toString();
170+
}
171+
fieldPos = 0;
172+
skip = false;
173+
continue;
174+
}
175+
if (!skip) {
176+
if (fieldPos < ID_FIELD.length) {
177+
if (buffer[i] == ID_FIELD[fieldPos]) {
178+
fieldPos += 1;
107179
} else {
108-
fieldName = subfields[0].trim();
109-
firstSubfield = 1;
180+
skip = true;
110181
}
111-
112-
receiver.startEntity(fieldName);
113-
114-
for (int i = firstSubfield; i < subfields.length; ++i) {
115-
final String subfield = subfields[i];
116-
receiver.literal(subfield.substring(0, 1),
117-
subfield.substring(1));
182+
} else {
183+
if (buffer[i] == PicaConstants.SUBFIELD_DELIMITER) {
184+
skip = true;
185+
} else {
186+
idBuilder.append(buffer[i]);
118187
}
119-
receiver.endEntity();
120188
}
121189
}
122-
123-
receiver.endRecord();
124-
} catch (IndexOutOfBoundsException e) {
125-
throw new FormatException(e);
126-
}
190+
}
191+
192+
return null;
127193
}
128194

129195
}

0 commit comments

Comments
 (0)