Skip to content

Commit c0eeb04

Browse files
committed
Add support for multiline pica records.
The Pica parser now accepts newlines (\x0a) in the input as a field delimiter. This makes it possible to parse not only Pica plain records but also multiline Pica records. The test cases were refactored to make the input being tested more visible.
1 parent ae5a08a commit c0eeb04

File tree

4 files changed

+259
-213
lines changed

4 files changed

+259
-213
lines changed

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaConstants.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@
2323
*/
2424
final class PicaConstants {
2525

26-
public static final char FIELD_DELIMITER = '\u001e';
27-
public static final char SUBFIELD_DELIMITER = '\u001f';
26+
public static final char RECORD_MARKER = '\u001d';
27+
public static final char FIELD_MARKER = '\u001e';
28+
public static final char SUBFIELD_MARKER = '\u001f';
29+
public static final char FIELD_END_MARKER = '\n';
2830

2931
private PicaConstants() {
3032
// No instances allowed

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
public final class PicaDecoder
5151
extends DefaultObjectPipe<String, StreamReceiver> {
5252

53-
private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_DELIMITER, '0'};
53+
private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_MARKER, '0'};
5454

5555
private static final int BUFFER_SIZE = 1024 * 1024;
5656

@@ -144,10 +144,10 @@ private boolean recordIsEmpty() {
144144
/**
145145
* Searches the record for the sequence specified in {@code ID_FIELD}
146146
* and returns all characters following this sequence until the next
147-
* {@link PicaConstants.FIELD_DELIMITER},
148-
* {@link PicaConstants.SUBFIELD_DELIMITER} or the end of the record
149-
* is reached. Only the first occurrence of the sequence is processed,
150-
* later occurrences are ignored.
147+
* {@link PicaConstants#FIELD_MARKER},
148+
* {@link PicaConstants#SUBFIELD_MARKER}, {@link PicaConstants#FIELD_END_MARKER}
149+
* or the end of the record is reached. Only the first occurrence of the
150+
* sequence is processed, later occurrences are ignored.
151151
*
152152
* If the sequence is not found in the string or if it is not followed
153153
* by any characters then {@code null} is returned.
@@ -161,7 +161,8 @@ private String extractRecordId() {
161161
int fieldPos = 0;
162162
boolean skip = false;
163163
for (int i = 0; i < recordLen; ++i) {
164-
if (buffer[i] == PicaConstants.FIELD_DELIMITER) {
164+
if (buffer[i] == PicaConstants.FIELD_MARKER
165+
|| buffer[i] == PicaConstants.FIELD_END_MARKER) {
165166
if (idBuilder.length() > 0) {
166167
break;
167168
}
@@ -176,7 +177,7 @@ private String extractRecordId() {
176177
skip = true;
177178
}
178179
} else {
179-
if (buffer[i] == PicaConstants.SUBFIELD_DELIMITER) {
180+
if (buffer[i] == PicaConstants.SUBFIELD_MARKER) {
180181
break;
181182
}
182183
idBuilder.append(buffer[i]);

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaParserState.java

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,18 @@ enum PicaParserState {
3838
@Override
3939
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
4040
final PicaParserState next;
41-
if (ch == PicaConstants.FIELD_DELIMITER) {
41+
switch (ch) {
42+
case PicaConstants.FIELD_MARKER:
43+
case PicaConstants.FIELD_END_MARKER:
4244
ctx.emitStartEntity();
4345
ctx.emitEndEntity();
4446
next = FIELD_NAME;
45-
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
47+
break;
48+
case PicaConstants.SUBFIELD_MARKER:
4649
ctx.emitStartEntity();
4750
next = SUBFIELD_NAME;
48-
} else {
51+
break;
52+
default:
4953
if (ch != ' ') {
5054
ctx.appendText(ch);
5155
}
@@ -64,12 +68,16 @@ protected void endOfInput(final PicaParserContext ctx) {
6468
@Override
6569
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
6670
final PicaParserState next;
67-
if (ch == PicaConstants.FIELD_DELIMITER) {
71+
switch (ch) {
72+
case PicaConstants.FIELD_MARKER:
73+
case PicaConstants.FIELD_END_MARKER:
6874
ctx.emitEndEntity();
6975
next = FIELD_NAME;
70-
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
76+
break;
77+
case PicaConstants.SUBFIELD_MARKER:
7178
next = this;
72-
} else {
79+
break;
80+
default:
7381
ctx.setSubfieldName(ch);
7482
next = SUBFIELD_VALUE;
7583
}
@@ -85,14 +93,18 @@ protected void endOfInput(final PicaParserContext ctx) {
8593
@Override
8694
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
8795
final PicaParserState next;
88-
if (ch == PicaConstants.FIELD_DELIMITER) {
96+
switch (ch) {
97+
case PicaConstants.FIELD_MARKER:
98+
case PicaConstants.FIELD_END_MARKER:
8999
ctx.emitLiteral();
90100
ctx.emitEndEntity();
91101
next = FIELD_NAME;
92-
} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
102+
break;
103+
case PicaConstants.SUBFIELD_MARKER:
93104
ctx.emitLiteral();
94105
next = SUBFIELD_NAME;
95-
} else {
106+
break;
107+
default:
96108
ctx.appendText(ch);
97109
next = this;
98110
}

0 commit comments

Comments
 (0)