Skip to content

Commit a84bdc5

Browse files
committed
Merge pull request #141 from cboehme/multiline-pica-parser
Multiline pica parser
2 parents b245c80 + bd30086 commit a84bdc5

File tree

4 files changed

+407
-234
lines changed

4 files changed

+407
-234
lines changed

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaConstants.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@
2323
*/
2424
final class PicaConstants {
2525

26-
public static final char FIELD_DELIMITER = '\u001e';
27-
public static final char SUBFIELD_DELIMITER = '\u001f';
26+
public static final char RECORD_MARKER = '\u001d';
27+
public static final char FIELD_MARKER = '\u001e';
28+
public static final char SUBFIELD_MARKER = '\u001f';
29+
public static final char FIELD_END_MARKER = '\n';
2830

2931
private PicaConstants() {
3032
// No instances allowed

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java

Lines changed: 80 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,34 +23,89 @@
2323

2424

2525
/**
26-
* Parses a PICA+ record with UTF8 encoding assumed.
26+
* <p>Parses pica+ records. The parser only parses single records.
27+
* A string containing multiple records must be split into
28+
* individual records before passing it to {@code PicaDecoder}.</p>
29+
*
30+
* <p>The parser is designed to accept any string as valid input and
31+
* to parse pica plain format as well as normalised pica. To
32+
* achieve this, the parser behaves as follows:</p>
33+
*
34+
* <ul>
35+
* <li>Fields are separated by record markers (0x1d), field
36+
* markers (0x1e) or field end markers (0x0a).</li>
37+
* <li>The field name and the first subfield are separated by
38+
* a subfield marker (0x1f).</li>
39+
* <li>The parser assumes that the input starts with a field
40+
* name.</li>
41+
* <li>The parser assumes that the end of the input marks
42+
* the end of the current field and the end of the record.
43+
* </li>
44+
* <li>Subfields are separated by subfield markers (0x1f).</li>
45+
* <li>The first character of a subfield is the name of the
46+
* subfield</li>
47+
* <li>To handle input with multiple field and subfield separators
48+
* following each other directly (for instance 0x0a and 0x1e), it
49+
* is assumed that field names, subfields, subfield names or
50+
* subfield values can be empty.</li>
51+
* </ul>
52+
*
53+
* <p>Please note that the record marker is treated as a field
54+
* delimiter and not as a record delimiter. Records need to be
55+
* separated prior to parsing them.</p>
56+
*
57+
* <p>As the behaviour of the parser may result in unnamed fields or
58+
* subfields or fields with no subfields the {@code PicaDecoder}
59+
* automatically filters empty fields and subfields:</p>
60+
*
61+
* <ul>
62+
* <li>Subfields without a name are ignored (such subfields cannot
63+
* have any value because then the first character of the value
64+
* would be the subfield name).</li>
65+
* <li>Subfields which only have a name but no value are always
66+
* parsed.</li>
67+
* <li>Unnamed fields are only parsed if they contain non-ignored
68+
* subfields.</li>
69+
* <li>Named fields containing none or only ignored subfields are
70+
* only parsed if {@code skipEmptyFields} is set to {@code false}
71+
* otherwise they are ignored.</li>
72+
* <li>Input containing only whitespace (spaces and tabs) is
73+
* completely ignored</li>
74+
* </ul>
75+
*
76+
* <p>The {@code PicaDecoder} calls {@code receiver.startEntity} and
77+
* {@code receiver.endEntity} for each parsed field and
78+
* {@code receiver.literal} for each parsed subfield. Spaces in the
79+
* field name are not included in the entity name. The input
80+
* "028A \x1faAndy\x1fdWarhol\x1e" would produce the following
81+
* sequence of calls:</p>
2782
*
28-
* For each field in the stream the module calls:
2983
* <ol>
30-
* <li>receiver.startEntity</li>
31-
* <li>receiver.literal for each subfield of the field</li>
32-
* <li>receiver.endEntity</li>
84+
* <li>receiver.startEntity("028A")</li>
85+
* <li>receiver.literal("a", "Andy")</li>
86+
* <li>receiver.literal("d", "Warhol")</li>
87+
* <li>receiver.endEntity()</li>
3388
* </ol>
3489
*
35-
* Spaces in the field name are not included in the entity name.
36-
*
37-
* Empty subfields are skipped. For instance, processing the following input
38-
* would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
39-
* skips unnamed fields without any subfields.
90+
* <p>The content of subfield 003@$0 is used for the record id. If
91+
* {@code ignoreMissingIdn} is false and field 003@$0 is not found
92+
* in the record a {@link MissingIdException} is thrown.</p>
4093
*
41-
* If {@code ignoreMissingIdn} is false and field 003@$0 is not found in the
42-
* record a {@link MissingIdException} is thrown.
94+
* <p>The parser assumes that the input is utf-8 encoded. The parser
95+
* does not support other pica encodings.</p>
4396
*
4497
* @author Christoph Böhme
4598
*
4699
*/
47-
@Description("Parses a PICA+ record with UTF8 encoding assumed.")
100+
@Description("Parses pica+ records. The parser only parses single records. " +
101+
"A string containing multiple records must be split into " +
102+
"individual records before passing it to PicaDecoder.")
48103
@In(String.class)
49104
@Out(StreamReceiver.class)
50105
public final class PicaDecoder
51106
extends DefaultObjectPipe<String, StreamReceiver> {
52107

53-
private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_DELIMITER, '0'};
108+
private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_MARKER, '0'};
54109

55110
private static final int BUFFER_SIZE = 1024 * 1024;
56111

@@ -144,10 +199,9 @@ private boolean recordIsEmpty() {
144199
/**
145200
* Searches the record for the sequence specified in {@code ID_FIELD}
146201
* and returns all characters following this sequence until the next
147-
* {@link PicaConstants.FIELD_DELIMITER},
148-
* {@link PicaConstants.SUBFIELD_DELIMITER} or the end of the record
149-
* is reached. Only the first occurrence of the sequence is processed,
150-
* later occurrences are ignored.
202+
* control character (see {@link PicaConstants}) is found or the end of
203+
* the record is reached. Only the first occurrence of the sequence is
204+
* processed, later occurrences are ignored.
151205
*
152206
* If the sequence is not found in the string or if it is not followed
153207
* by any characters then {@code null} is returned.
@@ -161,7 +215,7 @@ private String extractRecordId() {
161215
int fieldPos = 0;
162216
boolean skip = false;
163217
for (int i = 0; i < recordLen; ++i) {
164-
if (buffer[i] == PicaConstants.FIELD_DELIMITER) {
218+
if (isFieldDelimiter(buffer[i])) {
165219
if (idBuilder.length() > 0) {
166220
break;
167221
}
@@ -176,7 +230,7 @@ private String extractRecordId() {
176230
skip = true;
177231
}
178232
} else {
179-
if (buffer[i] == PicaConstants.SUBFIELD_DELIMITER) {
233+
if (buffer[i] == PicaConstants.SUBFIELD_MARKER) {
180234
break;
181235
}
182236
idBuilder.append(buffer[i]);
@@ -191,4 +245,10 @@ private String extractRecordId() {
191245
return null;
192246
}
193247

248+
private static boolean isFieldDelimiter(final char ch) {
249+
return ch == PicaConstants.RECORD_MARKER
250+
|| ch == PicaConstants.FIELD_MARKER
251+
|| ch == PicaConstants.FIELD_END_MARKER;
252+
}
253+
194254
}

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaParserState.java

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818

1919
/**
2020
* A parser for PICA+ records. Only single records can be parsed as the parser
21-
* does not recognise end-of-record markers (usually new lines). The initial
22-
* parser state is FIELD_NAME. All states are valid end states. The parser
23-
* processes any input, there is no error state.
21+
* ignores end of record markers. The initial parser state is FIELD_NAME. All
22+
* states are valid end states. The parser processes any input, there is no
23+
* error state.
2424
*
2525
* The parser ignores spaces in field names. They are not included in the
2626
* field name.
@@ -38,14 +38,19 @@ enum PicaParserState {
3838
@Override
3939
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
4040
final PicaParserState next;
41-
if (ch == PicaConstants.FIELD_DELIMITER) {
41+
switch (ch) {
42+
case PicaConstants.RECORD_MARKER:
43+
case PicaConstants.FIELD_MARKER:
44+
case PicaConstants.FIELD_END_MARKER:
4245
ctx.emitStartEntity();
4346
ctx.emitEndEntity();
4447
next = FIELD_NAME;
45-
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
48+
break;
49+
case PicaConstants.SUBFIELD_MARKER:
4650
ctx.emitStartEntity();
4751
next = SUBFIELD_NAME;
48-
} else {
52+
break;
53+
default:
4954
if (ch != ' ') {
5055
ctx.appendText(ch);
5156
}
@@ -64,12 +69,17 @@ protected void endOfInput(final PicaParserContext ctx) {
6469
@Override
6570
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
6671
final PicaParserState next;
67-
if (ch == PicaConstants.FIELD_DELIMITER) {
72+
switch (ch) {
73+
case PicaConstants.RECORD_MARKER:
74+
case PicaConstants.FIELD_MARKER:
75+
case PicaConstants.FIELD_END_MARKER:
6876
ctx.emitEndEntity();
6977
next = FIELD_NAME;
70-
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
78+
break;
79+
case PicaConstants.SUBFIELD_MARKER:
7180
next = this;
72-
} else {
81+
break;
82+
default:
7383
ctx.setSubfieldName(ch);
7484
next = SUBFIELD_VALUE;
7585
}
@@ -85,14 +95,19 @@ protected void endOfInput(final PicaParserContext ctx) {
8595
@Override
8696
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
8797
final PicaParserState next;
88-
if (ch == PicaConstants.FIELD_DELIMITER) {
98+
switch (ch) {
99+
case PicaConstants.RECORD_MARKER:
100+
case PicaConstants.FIELD_MARKER:
101+
case PicaConstants.FIELD_END_MARKER:
89102
ctx.emitLiteral();
90103
ctx.emitEndEntity();
91104
next = FIELD_NAME;
92-
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
105+
break;
106+
case PicaConstants.SUBFIELD_MARKER:
93107
ctx.emitLiteral();
94108
next = SUBFIELD_NAME;
95-
} else {
109+
break;
110+
default:
96111
ctx.appendText(ch);
97112
next = this;
98113
}

0 commit comments

Comments
 (0)