Skip to content

Commit 25b125f

Browse files
committed
Merge branch '296-handleNonNormalizedPica+'
2 parents 1f3f5a5 + 9c28a07 commit 25b125f

File tree

9 files changed

+1260
-56
lines changed

9 files changed

+1260
-56
lines changed
Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
/*
2-
* Copyright 2016 Christoph Böhme
1+
/* Copyright 2016,2019 Christoph Böhme and others
32
*
43
* Licensed under the Apache License, Version 2.0 the "License";
54
* you may not use this file except in compliance with the License.
@@ -13,23 +12,45 @@
1312
* See the License for the specific language governing permissions and
1413
* limitations under the License.
1514
*/
15+
1616
package org.metafacture.biblio.pica;
1717

1818
/**
19-
* Useful constants for PICA+
19+
* Useful constants for PICA+.
20+
* PICA+ comes with two possible serializations:
21+
* a normalized one and a non-normalized.
2022
*
21-
* @author Christoph Böhme
23+
* @author Christoph Böhme (initial implementation)
24+
* @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
25+
* @author Fabian Steeg (fsteeg) (switch to enum)
2226
*
2327
*/
24-
final class PicaConstants {
28+
enum PicaConstants {
29+
// We use '\0' for null/empty
30+
RECORD_MARKER('\u001d', '\n'), //
31+
FIELD_MARKER('\u001e', '\0'), //
32+
SUBFIELD_MARKER('\u001f', '$'), //
33+
FIELD_END_MARKER('\n', '\n'), //
34+
NO_MARKER('\0', '\0');
35+
36+
char normalized;
37+
char nonNormalized;
2538

26-
public static final char RECORD_MARKER = '\u001d';
27-
public static final char FIELD_MARKER = '\u001e';
28-
public static final char SUBFIELD_MARKER = '\u001f';
29-
public static final char FIELD_END_MARKER = '\n';
39+
PicaConstants(char normalized, char nonNormalized) {
40+
this.normalized = normalized;
41+
this.nonNormalized = nonNormalized;
42+
}
3043

31-
private PicaConstants() {
32-
// No instances allowed
33-
}
44+
public char get(boolean isNormalized) {
45+
return isNormalized ? normalized : nonNormalized;
46+
}
3447

35-
}
48+
public static PicaConstants from(boolean isNormalized, char ch) {
49+
for (PicaConstants value : values()) {
50+
if (ch == (isNormalized ? value.normalized : value.nonNormalized)) {
51+
return value;
52+
}
53+
}
54+
return NO_MARKER;
55+
}
56+
}

metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaDecoder.java

Lines changed: 70 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2016 Christoph Böhme
2+
* Copyright 2016, 2019 Christoph Böhme and others
33
*
44
* Licensed under the Apache License, Version 2.0 the "License";
55
* you may not use this file except in compliance with the License.
@@ -32,9 +32,11 @@
3232
* containing multiple records must be split into individual records before
3333
* passing it to {@code PicaDecoder}.
3434
* <p>
35-
* The parser is designed to accept any string as valid input and to parse pica
36-
* plain format as well as normalised pica. To achieve this, the parser behaves
37-
* as following:
35+
* The parser is designed to accept any string as valid input and to parse
36+
* pica+ in its two serialization forms:
37+
* as non-normalized and as normalized.
38+
* To achieve this, the parser behaves as follows when parsing
39+
* normalized pica+:
3840
* <ul>
3941
* <li>The parser assumes that the input starts with a field name.
4042
*
@@ -56,6 +58,26 @@
5658
* that field names, subfields, subfield names or subfield values can be
5759
* empty.
5860
* </ul>
61+
* non-normalized pica+:
62+
* <ul>
63+
* <li>The parser assumes that the input starts with a field name.
64+
*
65+
* <li>The field name and the first subfield are separated by a subfield
66+
* marker ($).
67+
*
68+
* <li>Fields are separated by record markers (&#92;n) or field end
69+
* markers (&#92;n).
70+
*
71+
* <li>Subfields are separated by subfield markers ($).
72+
*
73+
* <li>The first character of a subfield is the name of the subfield
74+
*
75+
* <li>The parser assumes that the end of the input marks the end of the
76+
* current field and the end of the record.
77+
*
78+
* <li>As multiple fields and subfields are not empty in non-normalized pica+
79+
* they are just treated like anything else.
80+
* </ul>
5981
* Please note that the record marker is treated as a field delimiter and not
6082
* as a record delimiter. Records need to be separated prior to parsing them.
6183
* <p>
@@ -69,7 +91,8 @@
6991
*
7092
* <li>Subfields which only have a name but no value are always parsed.
7193
*
72-
* <li>Unnamed fields are only parsed if the contain not-ignored subfields.
94+
* <li>In normalized pica+ unnamed fields are only parsed if they contain
95+
* not-ignored subfields. In non-normalized pica+ unnamed fields don't exist.
7396
*
7497
* <li>Named fields containing none or only ignored subfields are only parsed
7598
* if {@link #setSkipEmptyFields(boolean)} is set to false otherwise they are
@@ -85,7 +108,7 @@
85108
* {@link #setTrimFieldNames(boolean)} to false.
86109
* <p>
87110
* The record id emitted with the <i>start-record</i> event is extracted from
88-
* one of the following pica fields:
111+
* one of the following non-normalized pica+ fields:
89112
* <ul>
90113
* <li><i>003&#64; $0</i>
91114
* <li><i>107F $0</i>
@@ -97,7 +120,7 @@
97120
* found in the record a {@link MissingIdException} is thrown otherwise the
98121
* record identifier is an empty string.
99122
* <p>
100-
* For example, when run on the input
123+
* For example, when run on this input in its normalized serialization form:
101124
* <pre>
102125
* 003&#64; &#92;u001f01234&#92;u001e
103126
* 028A &#92;u001faAndy&#92;u001fdWarhol&#92;u001e
@@ -120,6 +143,8 @@
120143
* support other pica encodings.
121144
*
122145
* @author Christoph Böhme
146+
* @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
147+
* @author Fabian Steeg (fsteeg) (switch to enum)
123148
*
124149
*/
125150
@Description("Parses pica+ records. The parser only parses single records. " +
@@ -131,22 +156,50 @@
131156
public final class PicaDecoder
132157
extends DefaultObjectPipe<String, StreamReceiver> {
133158

134-
private static final String START_MARKERS ="(?:^|" + PicaConstants.FIELD_MARKER +
135-
"|" + PicaConstants.FIELD_END_MARKER + "|" + PicaConstants.RECORD_MARKER + ")";
136-
private static final Pattern ID_FIELDS_PATTERN = Pattern.compile(
137-
START_MARKERS + "(?:003@|203@(?:/..+)?|107F) " + PicaConstants.SUBFIELD_MARKER + "0");
138-
159+
private static String START_MARKERS;
160+
private static Pattern ID_FIELDS_PATTERN;
139161
private static final int BUFFER_SIZE = 1024 * 1024;
140162

141-
private final Matcher idFieldMatcher = ID_FIELDS_PATTERN.matcher("");
163+
private Matcher idFieldMatcher;
142164
private final StringBuilder idBuilder = new StringBuilder();
143165
private final PicaParserContext parserContext = new PicaParserContext();
144166

145167
private char[] buffer = new char[BUFFER_SIZE];
146168
private int recordLen;
147169

148170
private boolean ignoreMissingIdn;
171+
private boolean isNormalized;
172+
173+
public PicaDecoder() {
174+
this(true);
175+
}
176+
177+
public PicaDecoder(boolean normalized) {
178+
setNormalizedSerialization(normalized);
179+
}
149180

181+
/**
182+
* Controls whether the input is read as normalized or non-normalized
183+
* pica+. By default, "normalized" is assumed.
184+
*
185+
* @param normalized if true, the input is treated as normalized pica+;
186+
* if false, it's treated as non-normalized.
187+
*/
188+
public void setNormalizedSerialization(boolean normalized) {
189+
this.isNormalized = normalized;
190+
makeConstants();
191+
}
192+
193+
private void makeConstants() {
194+
START_MARKERS = "(?:^|" + PicaConstants.FIELD_MARKER.get(isNormalized) + "|"
195+
+ PicaConstants.FIELD_END_MARKER.get(isNormalized) + "|"
196+
+ PicaConstants.RECORD_MARKER.get(isNormalized) + "|.*\n" + ")";
197+
ID_FIELDS_PATTERN = Pattern
198+
.compile(START_MARKERS + "(?:003@|203@(?:/..+)?|107F) "
199+
+ " ?(\\" + PicaConstants.SUBFIELD_MARKER.get(isNormalized) + "|"
200+
+ PicaConstants.SUBFIELD_MARKER.get(isNormalized) + ")0");
201+
idFieldMatcher = ID_FIELDS_PATTERN.matcher("");
202+
}
150203
/**
151204
* Controls whether records having no record id are reported as faulty. By
152205
* default such records are reported by the {@code PicaDecoder} by throwing
@@ -250,7 +303,7 @@ public void process(final String record) {
250303

251304
PicaParserState state = PicaParserState.FIELD_NAME;
252305
for (int i = 0; i < recordLen; ++i) {
253-
state = state.parseChar(buffer[i], parserContext);
306+
state = state.parseChar(buffer[i], parserContext, isNormalized);
254307
}
255308
state.endOfInput(parserContext);
256309

@@ -284,7 +337,7 @@ private String extractRecordId() {
284337
idBuilder.setLength(0);
285338
for (int i = idFromIndex; i < recordLen; ++i) {
286339
final char ch = buffer[i];
287-
if (isSubfieldDelimiter(ch)) {
340+
if (isMarker(ch)) {
288341
break;
289342
}
290343
idBuilder.append(ch);
@@ -300,11 +353,8 @@ private int findRecordId() {
300353
return idFieldMatcher.end();
301354
}
302355

303-
private static boolean isSubfieldDelimiter(final char ch) {
304-
return ch == PicaConstants.RECORD_MARKER
305-
|| ch == PicaConstants.FIELD_MARKER
306-
|| ch == PicaConstants.FIELD_END_MARKER
307-
|| ch == PicaConstants.SUBFIELD_MARKER;
356+
private boolean isMarker(final char ch) {
357+
return PicaConstants.from(isNormalized, ch) != PicaConstants.NO_MARKER;
308358
}
309359

310360
}

metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaParserState.java

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2016 Christoph Böhme
2+
* Copyright 2016,2019 Christoph Böhme and others
33
*
44
* Licensed under the Apache License, Version 2.0 the "License";
55
* you may not use this file except in compliance with the License.
@@ -30,23 +30,24 @@
3030
* skips unnamed fields without any subfields.
3131
*
3232
* @author Christoph Böhme
33-
*
33+
* @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
34+
* @author Fabian Steeg (fsteeg) (switch to enum)
3435
*/
3536
enum PicaParserState {
3637

3738
FIELD_NAME {
3839
@Override
39-
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
40+
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx, boolean normalized) {
4041
final PicaParserState next;
41-
switch (ch) {
42-
case PicaConstants.RECORD_MARKER:
43-
case PicaConstants.FIELD_MARKER:
44-
case PicaConstants.FIELD_END_MARKER:
42+
switch (PicaConstants.from(normalized, ch)) {
43+
case RECORD_MARKER:
44+
case FIELD_MARKER:
45+
case FIELD_END_MARKER:
4546
ctx.emitStartEntity();
4647
ctx.emitEndEntity();
4748
next = FIELD_NAME;
4849
break;
49-
case PicaConstants.SUBFIELD_MARKER:
50+
case SUBFIELD_MARKER:
5051
ctx.emitStartEntity();
5152
next = SUBFIELD_NAME;
5253
break;
@@ -65,16 +66,16 @@ protected void endOfInput(final PicaParserContext ctx) {
6566
},
6667
SUBFIELD_NAME {
6768
@Override
68-
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
69+
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx, boolean normalized) {
6970
final PicaParserState next;
70-
switch (ch) {
71-
case PicaConstants.RECORD_MARKER:
72-
case PicaConstants.FIELD_MARKER:
73-
case PicaConstants.FIELD_END_MARKER:
71+
switch (PicaConstants.from(normalized, ch)) {
72+
case RECORD_MARKER:
73+
case FIELD_MARKER:
74+
case FIELD_END_MARKER:
7475
ctx.emitEndEntity();
7576
next = FIELD_NAME;
7677
break;
77-
case PicaConstants.SUBFIELD_MARKER:
78+
case SUBFIELD_MARKER:
7879
next = this;
7980
break;
8081
default:
@@ -91,17 +92,17 @@ protected void endOfInput(final PicaParserContext ctx) {
9192
},
9293
SUBFIELD_VALUE {
9394
@Override
94-
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
95+
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx, boolean normalized) {
9596
final PicaParserState next;
96-
switch (ch) {
97-
case PicaConstants.RECORD_MARKER:
98-
case PicaConstants.FIELD_MARKER:
99-
case PicaConstants.FIELD_END_MARKER:
97+
switch (PicaConstants.from(normalized, ch)) {
98+
case RECORD_MARKER:
99+
case FIELD_MARKER:
100+
case FIELD_END_MARKER:
100101
ctx.emitLiteral();
101102
ctx.emitEndEntity();
102103
next = FIELD_NAME;
103104
break;
104-
case PicaConstants.SUBFIELD_MARKER:
105+
case SUBFIELD_MARKER:
105106
ctx.emitLiteral();
106107
next = SUBFIELD_NAME;
107108
break;
@@ -119,7 +120,7 @@ protected void endOfInput(final PicaParserContext ctx) {
119120
}
120121
};
121122

122-
protected abstract PicaParserState parseChar(final char ch, final PicaParserContext ctx);
123+
protected abstract PicaParserState parseChar(final char ch, final PicaParserContext ctx, final boolean normalized);
123124

124125
protected abstract void endOfInput(final PicaParserContext ctx);
125126

0 commit comments

Comments
 (0)