Skip to content

Commit ae5a08a

Browse files
committed
Fix #137: Refactored the pica parser to be more lenient.
The parser does not expect fields and subfields to end with a field or subfield marker anymore. This should allow parsing of PICA serialisation variants which use the markers as delimiters as well as variants which use the markers as indicators. The `fixUnexpectedEOR` option was removed as there is no unexpected end of record anymore.
1 parent 89119a6 commit ae5a08a

File tree

3 files changed

+141
-130
lines changed

3 files changed

+141
-130
lines changed

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
*/
1616
package org.culturegraph.mf.stream.converter.bib;
1717

18-
import org.culturegraph.mf.exceptions.FormatException;
1918
import org.culturegraph.mf.framework.DefaultObjectPipe;
2019
import org.culturegraph.mf.framework.StreamReceiver;
2120
import org.culturegraph.mf.framework.annotations.Description;
@@ -62,7 +61,6 @@ public final class PicaDecoder
6261
private int recordLen;
6362

6463
private boolean ignoreMissingIdn;
65-
private boolean fixUnexpectedEOR;
6664

6765
public void setIgnoreMissingIdn(final boolean ignoreMissingIdn) {
6866
this.ignoreMissingIdn = ignoreMissingIdn;
@@ -72,14 +70,6 @@ public boolean getIgnoreMissingIdn() {
7270
return ignoreMissingIdn;
7371
}
7472

75-
public void setFixUnexpectedEOR(final boolean fixUnexpectedEOR) {
76-
this.fixUnexpectedEOR = fixUnexpectedEOR;
77-
}
78-
79-
public boolean getFixUnexpectedEOR() {
80-
return fixUnexpectedEOR;
81-
}
82-
8373
public void setNormalizeUTF8(final boolean normalizeUTF8) {
8474
parserContext.setNormalizeUTF8(normalizeUTF8);
8575
}
@@ -115,18 +105,11 @@ public void process(final String record) {
115105
}
116106
getReceiver().startRecord(id);
117107

118-
PicaParserState state = PicaParserState.FIELD_START;
108+
PicaParserState state = PicaParserState.FIELD_NAME;
119109
for (int i = 0; i < recordLen; ++i) {
120110
state = state.parseChar(buffer[i], parserContext);
121111
}
122-
if (state != PicaParserState.FIELD_START) {
123-
if (fixUnexpectedEOR) {
124-
state = state.parseChar(PicaConstants.FIELD_DELIMITER, parserContext);
125-
assert state == PicaParserState.FIELD_START;
126-
} else {
127-
throw new FormatException("Unexpected end of record");
128-
}
129-
}
112+
state.endOfInput(parserContext);
130113

131114
getReceiver().endRecord();
132115
}

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaParserState.java

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
/**
2020
* A parser for PICA+ records. Only single records can be parsed as the parser
2121
* does not recognise end-of-record markers (usually new lines). The initial
22-
* parser state is FIELD_START. A valid state for termination is FIELD_START.
23-
* The parser processes any input, there is no error state.
22+
* parser state is FIELD_NAME. All states are valid end states. The parser
23+
* processes any input, there is no error state.
2424
*
2525
* The parser ignores spaces in field names. They are not included in the
2626
* field name.
@@ -34,23 +34,14 @@
3434
*/
3535
enum PicaParserState {
3636

37-
FIELD_START {
38-
@Override
39-
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
40-
if (ch == PicaConstants.FIELD_DELIMITER || ch == ' ') {
41-
return FIELD_START;
42-
}
43-
return FIELD_NAME.parseChar(ch, ctx);
44-
}
45-
},
4637
FIELD_NAME {
4738
@Override
4839
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
4940
final PicaParserState next;
5041
if (ch == PicaConstants.FIELD_DELIMITER) {
5142
ctx.emitStartEntity();
5243
ctx.emitEndEntity();
53-
next = FIELD_START;
44+
next = FIELD_NAME;
5445
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
5546
ctx.emitStartEntity();
5647
next = SUBFIELD_NAME;
@@ -62,22 +53,33 @@ protected PicaParserState parseChar(final char ch, final PicaParserContext ctx)
6253
}
6354
return next;
6455
}
56+
57+
@Override
58+
protected void endOfInput(final PicaParserContext ctx) {
59+
ctx.emitStartEntity();
60+
ctx.emitEndEntity();
61+
}
6562
},
6663
SUBFIELD_NAME {
6764
@Override
6865
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
6966
final PicaParserState next;
7067
if (ch == PicaConstants.FIELD_DELIMITER) {
7168
ctx.emitEndEntity();
72-
next = FIELD_START;
69+
next = FIELD_NAME;
7370
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
74-
next = SUBFIELD_NAME;
71+
next = this;
7572
} else {
7673
ctx.setSubfieldName(ch);
7774
next = SUBFIELD_VALUE;
7875
}
7976
return next;
8077
}
78+
79+
@Override
80+
protected void endOfInput(final PicaParserContext ctx) {
81+
ctx.emitEndEntity();
82+
}
8183
},
8284
SUBFIELD_VALUE {
8385
@Override
@@ -86,7 +88,7 @@ protected PicaParserState parseChar(final char ch, final PicaParserContext ctx)
8688
if (ch == PicaConstants.FIELD_DELIMITER) {
8789
ctx.emitLiteral();
8890
ctx.emitEndEntity();
89-
next = FIELD_START;
91+
next = FIELD_NAME;
9092
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
9193
ctx.emitLiteral();
9294
next = SUBFIELD_NAME;
@@ -96,8 +98,16 @@ protected PicaParserState parseChar(final char ch, final PicaParserContext ctx)
9698
}
9799
return next;
98100
}
101+
102+
@Override
103+
protected void endOfInput(final PicaParserContext ctx) {
104+
ctx.emitLiteral();
105+
ctx.emitEndEntity();
106+
}
99107
};
100108

101109
protected abstract PicaParserState parseChar(final char ch, final PicaParserContext ctx);
102110

111+
protected abstract void endOfInput(final PicaParserContext ctx);
112+
103113
}

0 commit comments

Comments
 (0)