Skip to content

Commit ec81279

Browse files
committed
Handles record markers in pica records gracefully (fixes #139).
Modified the PICA parser to treat a record marker as if it were a field marker. Strictly speaking, this is not correct behaviour, but the PicaDecoder expects single records in the first place; consequently, a record marker should only be found at the beginning or the end of the input.
1 parent c0eeb04 commit ec81279

File tree

3 files changed

+131
-59
lines changed

3 files changed

+131
-59
lines changed

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,9 @@ private boolean recordIsEmpty() {
144144
/**
145145
* Searches the record for the sequence specified in {@code ID_FIELD}
146146
* and returns all characters following this sequence until the next
147-
* {@link PicaConstants.FIELD_MARKER},
148-
* {@link PicaConstants.SUBFIELD_MARKER}, {@link PicaConstants.LINE_END_MARKER}
149-
* or the end of the record is reached. Only the first occurrence of the
150-
* sequence is processed, later occurrences are ignored.
147+
* control character (see {@link PicaConstants}) is found or the end of
148+
* the record is reached. Only the first occurrence of the sequence is
149+
* processed, later occurrences are ignored.
151150
*
152151
* If the sequence is not found in the string or if it is not followed
153152
* by any characters then {@code null} is returned.
@@ -161,8 +160,7 @@ private String extractRecordId() {
161160
int fieldPos = 0;
162161
boolean skip = false;
163162
for (int i = 0; i < recordLen; ++i) {
164-
if (buffer[i] == PicaConstants.FIELD_MARKER
165-
|| buffer[i] == PicaConstants.FIELD_END_MARKER) {
163+
if (isFieldDelimiter(buffer[i])) {
166164
if (idBuilder.length() > 0) {
167165
break;
168166
}
@@ -192,4 +190,10 @@ private String extractRecordId() {
192190
return null;
193191
}
194192

193+
private static boolean isFieldDelimiter(final char ch) {
194+
return ch == PicaConstants.RECORD_MARKER
195+
|| ch == PicaConstants.FIELD_MARKER
196+
|| ch == PicaConstants.FIELD_END_MARKER;
197+
}
198+
195199
}

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaParserState.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818

1919
/**
2020
* A parser for PICA+ records. Only single records can be parsed as the parser
21-
* does not recognise end-of-record markers (usually new lines). The initial
22-
* parser state is FIELD_NAME. All states are valid end states. The parser
23-
* processes any input, there is no error state.
21+
* ignores end-of-record markers. The initial parser state is FIELD_NAME. All
22+
* states are valid end states. The parser processes any input, there is no
23+
* error state.
2424
*
2525
* The parser ignores spaces in field names. They are not included in the
2626
* field name.
@@ -39,6 +39,7 @@ enum PicaParserState {
3939
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
4040
final PicaParserState next;
4141
switch (ch) {
42+
case PicaConstants.RECORD_MARKER:
4243
case PicaConstants.FIELD_MARKER:
4344
case PicaConstants.FIELD_END_MARKER:
4445
ctx.emitStartEntity();
@@ -69,6 +70,7 @@ protected void endOfInput(final PicaParserContext ctx) {
6970
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
7071
final PicaParserState next;
7172
switch (ch) {
73+
case PicaConstants.RECORD_MARKER:
7274
case PicaConstants.FIELD_MARKER:
7375
case PicaConstants.FIELD_END_MARKER:
7476
ctx.emitEndEntity();
@@ -94,6 +96,7 @@ protected void endOfInput(final PicaParserContext ctx) {
9496
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
9597
final PicaParserState next;
9698
switch (ch) {
99+
case PicaConstants.RECORD_MARKER:
97100
case PicaConstants.FIELD_MARKER:
98101
case PicaConstants.FIELD_END_MARKER:
99102
ctx.emitLiteral();

src/test/java/org/culturegraph/mf/stream/converter/bib/PicaDecoderTest.java

Lines changed: 115 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,10 @@ public final class PicaDecoderTest {
4242
private static final String VALUE_D = "Umberto";
4343
private static final String COMPOSED_UTF8 = "Über"; // 'Ü' constructed from U and diacritics
4444
private static final String STANDARD_UTF8 = "Über"; // 'Ü' is a single character
45-
46-
private static final String SUBFIELD_MARKER = "\u001f";
45+
46+
private static final String RECORD_MARKER = "\u001d";
4747
private static final String FIELD_MARKER = "\u001e";
48+
private static final String SUBFIELD_MARKER = "\u001f";
4849
private static final String FIELD_END_MARKER = "\n";
4950

5051
private static final String FIELD_001AT_0_TEST = "001@ " + SUBFIELD_MARKER + "0test";
@@ -70,7 +71,103 @@ public void cleanup() {
7071
}
7172

7273
@Test
73-
public void testShouldParseRecordStartingWithFieldNameAndEndingWithFieldMarker() {
74+
public void testShouldParseRecordStartingWithRecordMarker() {
75+
picaDecoder.process(
76+
RECORD_MARKER + FIELD_001AT_0_TEST +
77+
FIELD_MARKER + FIELD_003AT_0_ID);
78+
79+
80+
final InOrder ordered = inOrder(receiver);
81+
ordered.verify(receiver).startRecord(RECORD_ID);
82+
verify001At0Test(ordered);
83+
verify003At0ID(ordered);
84+
ordered.verify(receiver).endRecord();
85+
}
86+
87+
@Test
88+
public void testShouldParseRecordStartingWithFieldMarker() {
89+
picaDecoder.process(
90+
FIELD_MARKER + FIELD_001AT_0_TEST +
91+
FIELD_MARKER + FIELD_003AT_0_ID);
92+
93+
94+
final InOrder ordered = inOrder(receiver);
95+
ordered.verify(receiver).startRecord(RECORD_ID);
96+
verify001At0Test(ordered);
97+
verify003At0ID(ordered);
98+
ordered.verify(receiver).endRecord();
99+
}
100+
101+
@Test
102+
public void testShouldParseRecordStartingWithSubfieldMarker() {
103+
picaDecoder.process(
104+
SUBFIELD_MARKER + NAME_A + VALUE_A +
105+
FIELD_MARKER + FIELD_003AT_0_ID);
106+
107+
final InOrder ordered = inOrder(receiver);
108+
ordered.verify(receiver).startRecord(RECORD_ID);
109+
ordered.verify(receiver).startEntity("");
110+
ordered.verify(receiver).literal(NAME_A, VALUE_A);
111+
ordered.verify(receiver).endEntity();
112+
verify003At0ID(ordered);
113+
ordered.verify(receiver).endRecord();
114+
}
115+
116+
@Test
117+
public void testShouldParseRecordStartingWithEmptySubfield() {
118+
picaDecoder.process(
119+
SUBFIELD_MARKER +
120+
FIELD_MARKER + FIELD_003AT_0_ID);
121+
122+
final InOrder ordered = inOrder(receiver);
123+
ordered.verify(receiver).startRecord(RECORD_ID);
124+
verify003At0ID(ordered);
125+
ordered.verify(receiver).endRecord();
126+
}
127+
128+
@Test
129+
public void testShouldParseRecordStartingWithFieldEndMarker() {
130+
picaDecoder.process(
131+
FIELD_END_MARKER + FIELD_001AT_0_TEST +
132+
FIELD_MARKER + FIELD_003AT_0_ID);
133+
134+
135+
final InOrder ordered = inOrder(receiver);
136+
ordered.verify(receiver).startRecord(RECORD_ID);
137+
verify001At0Test(ordered);
138+
verify003At0ID(ordered);
139+
ordered.verify(receiver).endRecord();
140+
}
141+
142+
@Test
143+
public void testShouldParseRecordStartingWithFieldName() {
144+
picaDecoder.process(
145+
FIELD_001AT_0_TEST +
146+
FIELD_MARKER + FIELD_003AT_0_ID);
147+
148+
149+
final InOrder ordered = inOrder(receiver);
150+
ordered.verify(receiver).startRecord(RECORD_ID);
151+
verify001At0Test(ordered);
152+
verify003At0ID(ordered);
153+
ordered.verify(receiver).endRecord();
154+
}
155+
156+
@Test
157+
public void testShouldParseRecordEndingWithRecordMarker() {
158+
picaDecoder.process(
159+
FIELD_003AT_0_ID + FIELD_MARKER +
160+
FIELD_001AT_0_TEST + RECORD_MARKER);
161+
162+
final InOrder ordered = inOrder(receiver);
163+
ordered.verify(receiver).startRecord(RECORD_ID);
164+
verify003At0ID(ordered);
165+
verify001At0Test(ordered);
166+
ordered.verify(receiver).endRecord();
167+
}
168+
169+
@Test
170+
public void testShouldParseRecordEndingWithFieldMarker() {
74171
picaDecoder.process(
75172
FIELD_003AT_0_ID + FIELD_MARKER +
76173
FIELD_001AT_0_TEST + FIELD_MARKER);
@@ -138,38 +235,33 @@ public void testShouldParseRecordEndingWithFieldName() {
138235
}
139236

140237
@Test
141-
public void testShouldParseRecordStartingWithFieldMarker() {
238+
public void testShouldParseMultiLineRecordFormat() {
142239
picaDecoder.process(
143-
FIELD_MARKER + FIELD_001AT_0_TEST +
144-
FIELD_MARKER + FIELD_003AT_0_ID);
240+
RECORD_MARKER + FIELD_END_MARKER +
241+
FIELD_MARKER + FIELD_001AT_0_TEST + FIELD_END_MARKER +
242+
FIELD_MARKER + FIELD_003AT_0_ID + FIELD_END_MARKER);
145243

146-
147244
final InOrder ordered = inOrder(receiver);
148245
ordered.verify(receiver).startRecord(RECORD_ID);
149246
verify001At0Test(ordered);
150247
verify003At0ID(ordered);
151248
ordered.verify(receiver).endRecord();
152249
}
153-
250+
154251
@Test
155-
public void testShouldParseRecordStartingWithSubfieldMarker() {
252+
public void testShouldParseRecordIdAfterRecordMarker() {
156253
picaDecoder.process(
157-
SUBFIELD_MARKER + NAME_A + VALUE_A +
158-
FIELD_MARKER + FIELD_003AT_0_ID);
159-
254+
RECORD_MARKER + FIELD_003AT_0_ID);
255+
160256
final InOrder ordered = inOrder(receiver);
161257
ordered.verify(receiver).startRecord(RECORD_ID);
162-
ordered.verify(receiver).startEntity("");
163-
ordered.verify(receiver).literal(NAME_A, VALUE_A);
164-
ordered.verify(receiver).endEntity();
165258
verify003At0ID(ordered);
166259
ordered.verify(receiver).endRecord();
167260
}
168-
261+
169262
@Test
170-
public void testShouldParseRecordStartingWithEmptySubfield() {
263+
public void testShouldParseRecordIdAfterFieldMarker() {
171264
picaDecoder.process(
172-
SUBFIELD_MARKER +
173265
FIELD_MARKER + FIELD_003AT_0_ID);
174266

175267
final InOrder ordered = inOrder(receiver);
@@ -179,28 +271,23 @@ public void testShouldParseRecordStartingWithEmptySubfield() {
179271
}
180272

181273
@Test
182-
public void testShouldParseRecordStartingWithFieldEndMarker() {
274+
public void testShouldParseRecordIdAfterFieldEndMarker() {
183275
picaDecoder.process(
184-
FIELD_END_MARKER + FIELD_001AT_0_TEST +
185-
FIELD_MARKER + FIELD_003AT_0_ID);
276+
FIELD_END_MARKER + FIELD_003AT_0_ID);
186277

187-
188278
final InOrder ordered = inOrder(receiver);
189279
ordered.verify(receiver).startRecord(RECORD_ID);
190-
verify001At0Test(ordered);
191280
verify003At0ID(ordered);
192281
ordered.verify(receiver).endRecord();
193282
}
194-
283+
195284
@Test
196-
public void testShouldParseRecordWitFieldEndMarkers() {
285+
public void testShouldParseRecordIdFollowedByRecordMarker() {
197286
picaDecoder.process(
198-
FIELD_MARKER + FIELD_001AT_0_TEST + FIELD_END_MARKER +
199-
FIELD_MARKER + FIELD_003AT_0_ID + FIELD_END_MARKER);
287+
FIELD_003AT_0_ID + RECORD_MARKER);
200288

201289
final InOrder ordered = inOrder(receiver);
202290
ordered.verify(receiver).startRecord(RECORD_ID);
203-
verify001At0Test(ordered);
204291
verify003At0ID(ordered);
205292
ordered.verify(receiver).endRecord();
206293
}
@@ -248,28 +335,6 @@ public void testShouldParseRecordIdAtRecordEnd() {
248335
verify003At0ID(ordered);
249336
ordered.verify(receiver).endRecord();
250337
}
251-
252-
@Test
253-
public void testShouldParseRecordIdAfterFieldMarker() {
254-
picaDecoder.process(
255-
FIELD_MARKER + FIELD_003AT_0_ID);
256-
257-
final InOrder ordered = inOrder(receiver);
258-
ordered.verify(receiver).startRecord(RECORD_ID);
259-
verify003At0ID(ordered);
260-
ordered.verify(receiver).endRecord();
261-
}
262-
263-
@Test
264-
public void testShouldParseRecordIdAfterFieldEndMarker() {
265-
picaDecoder.process(
266-
FIELD_END_MARKER + FIELD_003AT_0_ID);
267-
268-
final InOrder ordered = inOrder(receiver);
269-
ordered.verify(receiver).startRecord(RECORD_ID);
270-
verify003At0ID(ordered);
271-
ordered.verify(receiver).endRecord();
272-
}
273338

274339
@Test
275340
public void testShouldSkipUnnamedFieldsWithNoSubFields() {

0 commit comments

Comments
 (0)