Skip to content

Commit 9e736df

Browse files
committed
Skip unnamed fields without subfields.
Updated the PicaDecoder to skip fields which have neither a name nor contain any non-empty sub fields. This commit also introduces a new state which simplifies testing if the parser is a valid end state.
1 parent 3c75b41 commit 9e736df

File tree

4 files changed

+97
-20
lines changed

4 files changed

+97
-20
lines changed

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@
3636
* Spaces in the field name are not included in the entity name.
3737
*
3838
* Empty subfields are skipped. For instance, processing the following input
39-
* would NOT produce an empty literal: 003@ \u001f\u001e
39+
* would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
40+
* skips unnamed fields without any subfields.
4041
*
4142
* If {@code ignoreMissingIdn} is false and field 003@$0 is not found in the
4243
* record a {@link MissingIdException} is thrown.
@@ -114,15 +115,14 @@ public void process(final String record) {
114115
}
115116
getReceiver().startRecord(id);
116117

117-
PicaParserState state = PicaParserState.FIELD_NAME;
118+
PicaParserState state = PicaParserState.FIELD_START;
118119
for (int i = 0; i < recordLen; ++i) {
119120
state = state.parseChar(buffer[i], parserContext);
120121
}
121-
if (state != PicaParserState.FIELD_NAME || parserContext.hasUnprocessedText()) {
122+
if (state != PicaParserState.FIELD_START) {
122123
if (fixUnexpectedEOR) {
123124
state = state.parseChar(PicaConstants.FIELD_DELIMITER, parserContext);
124-
assert state == PicaParserState.FIELD_NAME;
125-
assert !parserContext.hasUnprocessedText();
125+
assert state == PicaParserState.FIELD_START;
126126
} else {
127127
throw new FormatException("Unexpected end of record");
128128
}

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaParserContext.java

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,6 @@ public void setReceiver(final StreamReceiver receiver) {
6262
this.receiver = receiver;
6363
}
6464

65-
public boolean hasUnprocessedText() {
66-
return builder.length() != 0;
67-
}
68-
6965
public void reset() {
7066
getTextAndReset();
7167
entityName = null;
@@ -88,7 +84,7 @@ protected void emitStartEntity() {
8884

8985
protected void emitEndEntity() {
9086
if (!literalsEmitted) {
91-
if (skipEmptyFields) {
87+
if (skipEmptyFields || entityName.isEmpty()) {
9288
return;
9389
}
9490
receiver.startEntity(entityName);

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaParserState.java

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,48 @@
1919
/**
2020
* A parser for PICA+ records. Only single records can be parsed as the parser
2121
* does not recognise end-of-record markers (usually new lines). The initial
22-
* parser state is FIELD_NAME. A valid state for termination is FIELD_NAME with
23-
* no unprocessed text being held in {@link PicaParserContext}. The parser
24-
* processes any input, there is no error state.
22+
* parser state is FIELD_START. A valid state for termination is FIELD_START.
23+
* The parser processes any input, there is no error state.
2524
*
2625
* The parser ignores spaces in field names. They are not included in the
2726
* field name.
2827
*
2928
* Empty subfields are skipped. For instance, parsing the following input
30-
* would NOT produce an empty literal: 003@ \u001f\u001e
29+
* would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
30+
* skips unnamed fields without any subfields.
3131
*
3232
* @author Christoph Böhme
3333
*
3434
*/
3535
enum PicaParserState {
3636

37+
FIELD_START {
38+
@Override
39+
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
40+
if (ch == PicaConstants.FIELD_DELIMITER || ch == ' ') {
41+
return FIELD_START;
42+
}
43+
return FIELD_NAME.parseChar(ch, ctx);
44+
}
45+
},
3746
FIELD_NAME {
3847
@Override
3948
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
49+
final PicaParserState next;
4050
if (ch == PicaConstants.FIELD_DELIMITER) {
4151
ctx.emitStartEntity();
4252
ctx.emitEndEntity();
53+
next = FIELD_START;
4354
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
4455
ctx.emitStartEntity();
45-
return SUBFIELD_NAME;
46-
} else if (ch != ' ') {
47-
ctx.appendText(ch);
56+
next = SUBFIELD_NAME;
57+
} else {
58+
if (ch != ' ') {
59+
ctx.appendText(ch);
60+
}
61+
next = this;
4862
}
49-
return this;
63+
return next;
5064
}
5165
},
5266
SUBFIELD_NAME {
@@ -55,7 +69,7 @@ protected PicaParserState parseChar(final char ch, final PicaParserContext ctx)
5569
final PicaParserState next;
5670
if (ch == PicaConstants.FIELD_DELIMITER) {
5771
ctx.emitEndEntity();
58-
next = FIELD_NAME;
72+
next = FIELD_START;
5973
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
6074
next = SUBFIELD_NAME;
6175
} else {
@@ -72,7 +86,7 @@ protected PicaParserState parseChar(final char ch, final PicaParserContext ctx)
7286
if (ch == PicaConstants.FIELD_DELIMITER) {
7387
ctx.emitLiteral();
7488
ctx.emitEndEntity();
75-
next = FIELD_NAME;
89+
next = FIELD_START;
7690
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
7791
ctx.emitLiteral();
7892
next = SUBFIELD_NAME;

src/test/java/org/culturegraph/mf/stream/converter/bib/PicaDecoderTest.java

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
package org.culturegraph.mf.stream.converter.bib;
1717

1818
import static org.mockito.Mockito.inOrder;
19+
import static org.mockito.Mockito.verifyNoMoreInteractions;
1920

2021
import org.culturegraph.mf.exceptions.FormatException;
2122
import org.culturegraph.mf.framework.StreamReceiver;
@@ -48,6 +49,9 @@ public final class PicaDecoderTest {
4849
private static final String SUBFIELD_D = SUBFIELD + "dUmberto";
4950
private static final String EMPTY_SUBFIELD = SUBFIELD;
5051
private static final String FIELD_028A_END = FIELD_END;
52+
private static final String EMPTY_FIELD_START = "";
53+
private static final String SUBFIELD_X = SUBFIELD + "Xyz";
54+
private static final String EMPTY_FIELD_END = FIELD_END;
5155

5256
private PicaDecoder picaDecoder;
5357

@@ -87,6 +91,60 @@ public void testShouldParseValidPica() {
8791
ordered.verify(receiver).endRecord();
8892
}
8993

94+
@Test
95+
public void testShouldSkipUnnamedFieldsWithNoSubFields() {
96+
picaDecoder.setSkipEmptyFields(false);
97+
98+
picaDecoder.process(
99+
FIELD_001AT +
100+
FIELD_003AT +
101+
EMPTY_FIELD_START +
102+
EMPTY_FIELD_END);
103+
104+
final InOrder ordered = inOrder(receiver);
105+
ordered.verify(receiver).startRecord(RECORD_ID);
106+
verify001At(ordered);
107+
verify003At(ordered);
108+
ordered.verify(receiver).endRecord();
109+
verifyNoMoreInteractions(receiver);
110+
}
111+
112+
@Test
113+
public void testShouldSkipUnnamedFieldsWithEmptySubFieldsOnly() {
114+
picaDecoder.setSkipEmptyFields(false);
115+
116+
picaDecoder.process(
117+
FIELD_001AT +
118+
FIELD_003AT +
119+
EMPTY_FIELD_START +
120+
EMPTY_SUBFIELD +
121+
EMPTY_FIELD_END);
122+
123+
final InOrder ordered = inOrder(receiver);
124+
ordered.verify(receiver).startRecord(RECORD_ID);
125+
verify001At(ordered);
126+
verify003At(ordered);
127+
ordered.verify(receiver).endRecord();
128+
verifyNoMoreInteractions(receiver);
129+
}
130+
131+
@Test
132+
public void testShouldNotSkipUnnamedFieldsWithSubFields() {
133+
picaDecoder.process(
134+
FIELD_001AT +
135+
FIELD_003AT +
136+
EMPTY_FIELD_START +
137+
SUBFIELD_X +
138+
EMPTY_FIELD_END);
139+
140+
final InOrder ordered = inOrder(receiver);
141+
ordered.verify(receiver).startRecord(RECORD_ID);
142+
verify001At(ordered);
143+
verify003At(ordered);
144+
verifySubfieldX(ordered);
145+
ordered.verify(receiver).endRecord();
146+
}
147+
90148
@Test
91149
public void testShouldSkipEmptySubfields() {
92150
picaDecoder.process(
@@ -105,6 +163,7 @@ public void testShouldSkipEmptySubfields() {
105163
verifySubfieldD(ordered);
106164
verify028AEnd(ordered);
107165
ordered.verify(receiver).endRecord();
166+
verifyNoMoreInteractions(receiver);
108167
}
109168

110169
@Test
@@ -120,6 +179,7 @@ public void testShouldSkipEmptyFieldsByDefault() {
120179
verify001At(ordered);
121180
verify003At(ordered);
122181
ordered.verify(receiver).endRecord();
182+
verifyNoMoreInteractions(receiver);
123183
}
124184

125185
@Test
@@ -155,6 +215,7 @@ public void testShouldSkipEmptyFieldsWithOnlyEmptySubfieldsByDefault() {
155215
verify001At(ordered);
156216
verify003At(ordered);
157217
ordered.verify(receiver).endRecord();
218+
verifyNoMoreInteractions(receiver);
158219
}
159220

160221
@Test
@@ -300,6 +361,12 @@ private void verify028AEnd(final InOrder ordered) {
300361
ordered.verify(receiver).endEntity();
301362
}
302363

364+
private void verifySubfieldX(final InOrder ordered) {
365+
ordered.verify(receiver).startEntity("");
366+
ordered.verify(receiver).literal("X", "yz");
367+
ordered.verify(receiver).endEntity();
368+
}
369+
303370
private void verify021A(final InOrder ordered, final String value) {
304371
ordered.verify(receiver).startEntity("021A");
305372
ordered.verify(receiver).literal("a", value);

0 commit comments

Comments
 (0)