Skip to content

Commit 6d2c9c9

Browse files
author
Markus M. Geipel
committed
Merge pull request #113 from cboehme/parsing-pica-decoder
Re-implemented PicaDecoder based on a state machine.
2 parents c8e622a + 9e736df commit 6d2c9c9

File tree

8 files changed

+778
-182
lines changed

8 files changed

+778
-182
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright 2013 Christoph Böhme
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.converter.bib;
17+
18+
/**
 * Delimiter characters used when reading PICA+ data.
 *
 * @author Christoph Böhme
 *
 */
final class PicaConstants {

    /** Control character U+001E, used as the field delimiter. */
    public static final char FIELD_DELIMITER = '\u001e';

    /** Control character U+001F, used as the subfield delimiter. */
    public static final char SUBFIELD_DELIMITER = '\u001f';

    private PicaConstants() {
        // Pure constants holder, never instantiated
    }

}
Lines changed: 142 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2013 Deutsche Nationalbibliothek
2+
* Copyright 2013 Christoph Böhme
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -15,11 +15,6 @@
1515
*/
1616
package org.culturegraph.mf.stream.converter.bib;
1717

18-
import java.text.Normalizer;
19-
import java.text.Normalizer.Form;
20-
import java.util.regex.Matcher;
21-
import java.util.regex.Pattern;
22-
2318
import org.culturegraph.mf.exceptions.FormatException;
2419
import org.culturegraph.mf.framework.DefaultObjectPipe;
2520
import org.culturegraph.mf.framework.StreamReceiver;
@@ -29,101 +24,172 @@
2924

3025

3126
/**
32-
* Parses a raw Picaplus stream (utf8 encoding assumed). Events are handled by a
33-
* {@link StreamReceiver}.
27+
* Parses a PICA+ record with UTF8 encoding assumed.
28+
*
29+
* For each field in the stream the module calls:
30+
* <ol>
31+
* <li>receiver.startEntity</li>
32+
* <li>receiver.literal for each subfield of the field</li>
33+
* <li>receiver.endEntity</li>
34+
* </ol>
35+
*
36+
* Spaces in the field name are not included in the entity name.
37+
*
38+
* Empty subfields are skipped. For instance, processing the following input
39+
* would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
40+
* skips unnamed fields without any subfields.
3441
*
35-
* @see StreamReceiver
42+
* If {@code ignoreMissingIdn} is false and field 003@$0 is not found in the
43+
* record a {@link MissingIdException} is thrown.
3644
*
37-
* @author Markus Michael Geipel, Christoph Böhme
45+
* @author Christoph Böhme
3846
*
3947
*/
40-
@Description("Parses a raw Picaplus stream (utf8 encoding assumed).")
48+
@Description("Parses a PICA+ record with UTF8 encoding assumed.")
4149
@In(String.class)
4250
@Out(StreamReceiver.class)
43-
public final class PicaDecoder
51+
public final class PicaDecoder
4452
extends DefaultObjectPipe<String, StreamReceiver> {
4553

46-
private static final String FIELD_DELIMITER = "\u001e";
47-
private static final String SUB_DELIMITER = "\u001f";
48-
private static final Pattern FIELD_PATTERN = Pattern.compile(
49-
FIELD_DELIMITER, Pattern.LITERAL);
50-
private static final Pattern SUBFIELD_PATTERN = Pattern.compile(
51-
SUB_DELIMITER, Pattern.LITERAL);
52-
private static final String ID_PATTERN_STRING = FIELD_DELIMITER + "003@ "
53-
+ SUB_DELIMITER + "0(.*?)" + FIELD_DELIMITER;
54-
private static final Pattern ID_PATTERN = Pattern
55-
.compile(ID_PATTERN_STRING);
56-
private static boolean appendControlSubField = true;
54+
private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_DELIMITER, '0'};
55+
56+
private static final int BUFFER_SIZE = 1024 * 1024;
57+
58+
private final StringBuilder idBuilder = new StringBuilder();
59+
private final PicaParserContext parserContext = new PicaParserContext();
60+
61+
private char[] buffer = new char[BUFFER_SIZE];
62+
private int recordLen;
63+
64+
private boolean ignoreMissingIdn;
65+
private boolean fixUnexpectedEOR;
5766

58-
/**
59-
* For each field in the stream the method calls:
60-
* <ol>
61-
* <li>receiver.startEntity</li>
62-
* <li>receiver.literal for each subfield of the field</li>
63-
* <li>receiver.endEntity</li>
64-
* </ol>
65-
* Fields without any subfield will be skipped.<br>
66-
* <strong>Special handling of subfield 'S':</strong> the code of
67-
* "control subfields" (subfield name='S') will be appended to the
68-
* fieldName. E.g.: 041A $Sa would be mapped to the fieldName 041Aa
69-
*
70-
* @param record
71-
*/
67+
public void setIgnoreMissingIdn(final boolean ignoreMissingIdn) {
68+
this.ignoreMissingIdn = ignoreMissingIdn;
69+
}
70+
71+
public boolean getIgnoreMissingIdn() {
72+
return ignoreMissingIdn;
73+
}
74+
75+
public void setFixUnexpectedEOR(final boolean fixUnexpectedEOR) {
76+
this.fixUnexpectedEOR = fixUnexpectedEOR;
77+
}
78+
79+
public boolean getFixUnexpectedEOR() {
80+
return fixUnexpectedEOR;
81+
}
82+
83+
public void setNormalizeUTF8(final boolean normalizeUTF8) {
84+
parserContext.setNormalizeUTF8(normalizeUTF8);
85+
}
86+
87+
public boolean getNormalizeUTF8() {
88+
return parserContext.getNormalizeUTF8();
89+
}
90+
91+
public void setSkipEmptyFields(final boolean skipEmptyFields) {
92+
parserContext.setSkipEmptyFields(skipEmptyFields);
93+
}
94+
95+
public boolean getSkipEmptyFields() {
96+
return parserContext.getSkipEmptyFields();
97+
}
98+
7299
@Override
73100
public void process(final String record) {
74101
assert !isClosed();
75-
process(record, getReceiver());
76-
}
102+
103+
copyToBuffer(record);
104+
105+
if (recordIsEmpty()) {
106+
return;
107+
}
108+
109+
String id = extractRecordId();
110+
if (id == null) {
111+
if (!ignoreMissingIdn) {
112+
throw new MissingIdException("Record has no id");
113+
}
114+
id = "";
115+
}
116+
getReceiver().startRecord(id);
77117

78-
public static void setAppendControlSubField(final boolean appendControlSubField) {
79-
PicaDecoder.appendControlSubField = appendControlSubField;
118+
PicaParserState state = PicaParserState.FIELD_START;
119+
for (int i = 0; i < recordLen; ++i) {
120+
state = state.parseChar(buffer[i], parserContext);
121+
}
122+
if (state != PicaParserState.FIELD_START) {
123+
if (fixUnexpectedEOR) {
124+
state = state.parseChar(PicaConstants.FIELD_DELIMITER, parserContext);
125+
assert state == PicaParserState.FIELD_START;
126+
} else {
127+
throw new FormatException("Unexpected end of record");
128+
}
129+
}
130+
131+
getReceiver().endRecord();
132+
}
133+
134+
@Override
135+
protected void onSetReceiver() {
136+
parserContext.setReceiver(getReceiver());
137+
}
138+
139+
@Override
140+
protected void onResetStream() {
141+
parserContext.reset();
80142
}
81143

82-
public static String extractIdFromRecord(final String record) {
83-
final Matcher idMatcher = ID_PATTERN.matcher(record);
84-
if (idMatcher.find()) {
85-
return idMatcher.group(1);
144+
private void copyToBuffer(final String record) {
145+
recordLen = record.length();
146+
if(recordLen > buffer.length) {
147+
buffer = new char[buffer.length * 2];
86148
}
87-
throw new MissingIdException(record);
149+
record.getChars(0, recordLen, buffer, 0);
88150
}
89151

90-
public static void process(final String rawRecord, final StreamReceiver receiver) {
91-
if (rawRecord.trim().isEmpty()) {
92-
return;
152+
private boolean recordIsEmpty() {
153+
for (int i = 0; i < recordLen; ++i) {
154+
if (buffer[i] != ' ' && buffer[i] != '\t') {
155+
return false;
156+
}
93157
}
158+
return true;
159+
}
160+
161+
private String extractRecordId() {
162+
idBuilder.setLength(0);
94163

95-
final String record = Normalizer.normalize(rawRecord, Form.NFC);
96-
try {
97-
receiver.startRecord(extractIdFromRecord(record));
98-
99-
for (String field : FIELD_PATTERN.split(record)) {
100-
final String[] subfields = SUBFIELD_PATTERN.split(field);
101-
if (subfields.length > 1) {
102-
final String fieldName;
103-
final int firstSubfield;
104-
if (subfields[1].charAt(0) == 'S' && appendControlSubField ) {
105-
fieldName = subfields[0].trim() + subfields[1].charAt(1);
106-
firstSubfield = 2;
164+
int fieldPos = 0;
165+
boolean skip = false;
166+
for (int i = 0; i < recordLen; ++i) {
167+
if (buffer[i] == PicaConstants.FIELD_DELIMITER) {
168+
if (idBuilder.length() > 0) {
169+
return idBuilder.toString();
170+
}
171+
fieldPos = 0;
172+
skip = false;
173+
continue;
174+
}
175+
if (!skip) {
176+
if (fieldPos < ID_FIELD.length) {
177+
if (buffer[i] == ID_FIELD[fieldPos]) {
178+
fieldPos += 1;
107179
} else {
108-
fieldName = subfields[0].trim();
109-
firstSubfield = 1;
180+
skip = true;
110181
}
111-
112-
receiver.startEntity(fieldName);
113-
114-
for (int i = firstSubfield; i < subfields.length; ++i) {
115-
final String subfield = subfields[i];
116-
receiver.literal(subfield.substring(0, 1),
117-
subfield.substring(1));
182+
} else {
183+
if (buffer[i] == PicaConstants.SUBFIELD_DELIMITER) {
184+
skip = true;
185+
} else {
186+
idBuilder.append(buffer[i]);
118187
}
119-
receiver.endEntity();
120188
}
121189
}
122-
123-
receiver.endRecord();
124-
} catch (IndexOutOfBoundsException e) {
125-
throw new FormatException(e);
126-
}
190+
}
191+
192+
return null;
127193
}
128194

129195
}

0 commit comments

Comments
 (0)