Skip to content

Commit e3d9674

Browse files
committed
Implement non-normalized PICA+ serialization
This should be backwards compatible as the default is still the normalized serialization. - add LineRecorder to split non-normalized serialized PICA+ records - add testing of LineRecorder - add flux example - add flux command See #296.
1 parent 1f3f5a5 commit e3d9674

File tree

9 files changed

+1246
-53
lines changed

9 files changed

+1246
-53
lines changed

metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaConstants.java

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
/*
2-
* Copyright 2016 Christoph Böhme
1+
/* Copyright 2016,2019 Christoph Böhme and hbz
32
*
43
* Licensed under the Apache License, Version 2.0 the "License";
54
* you may not use this file except in compliance with the License.
@@ -13,20 +12,38 @@
1312
* See the License for the specific language governing permissions and
1413
* limitations under the License.
1514
*/
16-
package org.metafacture.biblio.pica;
1715

1816
/**
19-
* Useful constants for PICA+
17+
* Useful constants for PICA+.
18+
* PICA+ comes with two possible serializations:
19+
* a normalized one and a non-normalized one.
2020
*
2121
* @author Christoph Böhme
22+
* @author Pascal Christoph (dr0i)
2223
*
2324
*/
24-
final class PicaConstants {
2525

26-
public static final char RECORD_MARKER = '\u001d';
27-
public static final char FIELD_MARKER = '\u001e';
28-
public static final char SUBFIELD_MARKER = '\u001f';
29-
public static final char FIELD_END_MARKER = '\n';
26+
package org.metafacture.biblio.pica;
27+
28+
final class PicaConstants{
29+
public static char RECORD_MARKER = '\u001d';
30+
public static char FIELD_MARKER = '\u001e';
31+
public static char SUBFIELD_MARKER = '\u001f';
32+
public static char FIELD_END_MARKER = '\n';
33+
34+
public static void setNormalizedSerialization() {
35+
RECORD_MARKER = '\u001d';
36+
FIELD_MARKER = '\u001e';
37+
SUBFIELD_MARKER = '\u001f';
38+
FIELD_END_MARKER = '\n';
39+
}
40+
41+
public static void setNonNormalizedSerialization() {
42+
RECORD_MARKER = '\n';
43+
FIELD_MARKER = '\n'; //this is a dummy
44+
SUBFIELD_MARKER = '$';
45+
FIELD_END_MARKER = '\n';
46+
}
3047

3148
private PicaConstants() {
3249
// No instances allowed

metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaDecoder.java

Lines changed: 66 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2016 Christoph Böhme
2+
* Copyright 2016, 2019 Christoph Böhme and hbz
33
*
44
* Licensed under the Apache License, Version 2.0 the "License";
55
* you may not use this file except in compliance with the License.
@@ -32,9 +32,11 @@
3232
* containing multiple records must be split into individual records before
3333
* passing it to {@code PicaDecoder}.
3434
* <p>
35-
* The parser is designed to accept any string as valid input and to parse pica
36-
* plain format as well as normalised pica. To achieve this, the parser behaves
37-
* as following:
35+
* The parser is designed to accept any string as valid input and to parse
36+
* pica+ in its two serialization forms:
37+
* as non-normalized and as normalized.
38+
* To achieve this, the parser behaves as follows when parsing
39+
* normalized pica+:
3840
* <ul>
3941
* <li>The parser assumes that the input starts with a field name.
4042
*
@@ -56,6 +58,26 @@
5658
* that field names, subfields, subfield names or subfield values can be
5759
* empty.
5860
* </ul>
61+
* When parsing non-normalized pica+:
62+
* <ul>
63+
* <li>The parser assumes that the input starts with a field name.
64+
*
65+
* <li>The field name and the first subfield are separated by a subfield
66+
* marker ($).
67+
*
68+
* <li>Fields are separated by record markers (&#92;n) or field end
69+
* markers (&#92;n).
70+
*
71+
* <li>Subfields are separated by subfield markers ($).
72+
*
73+
* <li>The first character of a subfield is the name of the subfield.
74+
*
75+
* <li>The parser assumes that the end of the input marks the end of the
76+
* current field and the end of the record.
77+
*
78+
* <li>As multiple fields and subfields are not empty in non-normalized pica+,
79+
* they are just treated like anything else.
80+
* </ul>
5981
* Please note that the record marker is treated as a field delimiter and not
6082
* as a record delimiter. Records need to be separated prior to parsing them.
6183
* <p>
@@ -69,7 +91,8 @@
6991
*
7092
* <li>Subfields which only have a name but no value are always parsed.
7193
*
72-
* <li>Unnamed fields are only parsed if the contain not-ignored subfields.
94+
* <li>In normalized pica+ unnamed fields are only parsed if they contain
95+
* not-ignored subfields. In non-normalized pica+ unnamed fields do not exist.
7396
*
7497
* <li>Named fields containing none or only ignored subfields are only parsed
7598
* if {@link #setSkipEmptyFields(boolean)} is set to false otherwise they are
@@ -85,7 +108,7 @@
85108
* {@link #setTrimFieldNames(boolean)} to false.
86109
* <p>
87110
* The record id emitted with the <i>start-record</i> event is extracted from
88-
* one of the following pica fields:
111+
* one of the following non-normalized pica+ fields:
89112
* <ul>
90113
* <li><i>003&#64; $0</i>
91114
* <li><i>107F $0</i>
@@ -97,7 +120,7 @@
97120
* found in the record a {@link MissingIdException} is thrown otherwise the
98121
* record identifier is an empty string.
99122
* <p>
100-
* For example, when run on the input
123+
* For example, when run on this input in its normalized serialization form:
101124
* <pre>
102125
* 003&#64; &#92;u001f01234&#92;u001e
103126
* 028A &#92;u001faAndy&#92;u001fdWarhol&#92;u001e
@@ -120,6 +143,7 @@
120143
* support other pica encodings.
121144
*
122145
* @author Christoph Böhme
146+
* @author Pascal Christoph (dr0i)
123147
*
124148
*/
125149
@Description("Parses pica+ records. The parser only parses single records. " +
@@ -131,14 +155,11 @@
131155
public final class PicaDecoder
132156
extends DefaultObjectPipe<String, StreamReceiver> {
133157

134-
private static final String START_MARKERS ="(?:^|" + PicaConstants.FIELD_MARKER +
135-
"|" + PicaConstants.FIELD_END_MARKER + "|" + PicaConstants.RECORD_MARKER + ")";
136-
private static final Pattern ID_FIELDS_PATTERN = Pattern.compile(
137-
START_MARKERS + "(?:003@|203@(?:/..+)?|107F) " + PicaConstants.SUBFIELD_MARKER + "0");
138-
158+
private static String START_MARKERS;
159+
private static Pattern ID_FIELDS_PATTERN;
139160
private static final int BUFFER_SIZE = 1024 * 1024;
140161

141-
private final Matcher idFieldMatcher = ID_FIELDS_PATTERN.matcher("");
162+
private Matcher idFieldMatcher;
142163
private final StringBuilder idBuilder = new StringBuilder();
143164
private final PicaParserContext parserContext = new PicaParserContext();
144165

@@ -147,6 +168,38 @@ public final class PicaDecoder
147168

148169
private boolean ignoreMissingIdn;
149170

171+
public PicaDecoder() {
172+
makeConstants();
173+
}
174+
175+
public PicaDecoder(boolean normalized) {
176+
setNormalizedSerialization(normalized);
177+
makeConstants();
178+
}
179+
/**
180+
* Controls whether the input is serialized as normalized or non-normalized
181+
* pica+. By default, "normalized" is assumed.
182+
*
183+
* @param normalized if true, the input is treated as "normalized" pica+;
184+
* if false, it is treated as non-normalized pica+.
185+
*/
186+
public void setNormalizedSerialization(boolean normalized) {
187+
if (normalized)
188+
PicaConstants.setNormalizedSerialization();
189+
else
190+
PicaConstants.setNonNormalizedSerialization();
191+
makeConstants();
192+
}
193+
private void makeConstants() {
194+
START_MARKERS = "(?:^|" + PicaConstants.FIELD_MARKER + "|"
195+
+ PicaConstants.FIELD_END_MARKER + "|"
196+
+ PicaConstants.RECORD_MARKER + "|.*\n" + ")";
197+
ID_FIELDS_PATTERN = Pattern
198+
.compile(START_MARKERS + "(?:003@|203@(?:/..+)?|107F) "
199+
+ " ?(\\" + PicaConstants.SUBFIELD_MARKER + "|"
200+
+ PicaConstants.SUBFIELD_MARKER + ")0");
201+
idFieldMatcher = ID_FIELDS_PATTERN.matcher("");
202+
}
150203
/**
151204
* Controls whether records having no record id are reported as faulty. By
152205
* default such records are reported by the {@code PicaDecoder} by throwing

metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaParserState.java

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2016 Christoph Böhme
2+
* Copyright 2016,2019 Christoph Böhme and hbz
33
*
44
* Licensed under the Apache License, Version 2.0 the "License";
55
* you may not use this file except in compliance with the License.
@@ -25,32 +25,30 @@
2525
* The parser ignores spaces in field names. They are not included in the
2626
* field name.
2727
*
28-
* Empty subfields are skipped. For instance, parsing the following input
29-
* would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
28+
* Empty subfields are skipped. For instance, parsing the following normalized
29+
* pica+ would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
3030
* skips unnamed fields without any subfields.
3131
*
3232
* @author Christoph Böhme
33-
*
33+
* @author Pascal Christoph (dr0i)
34+
*
3435
*/
3536
enum PicaParserState {
3637

3738
FIELD_NAME {
3839
@Override
3940
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
4041
final PicaParserState next;
41-
switch (ch) {
42-
case PicaConstants.RECORD_MARKER:
43-
case PicaConstants.FIELD_MARKER:
44-
case PicaConstants.FIELD_END_MARKER:
42+
if(ch==PicaConstants.RECORD_MARKER ||
43+
ch==PicaConstants.FIELD_MARKER ||
44+
ch==PicaConstants.FIELD_END_MARKER){
4545
ctx.emitStartEntity();
4646
ctx.emitEndEntity();
4747
next = FIELD_NAME;
48-
break;
49-
case PicaConstants.SUBFIELD_MARKER:
48+
}else if(ch==PicaConstants.SUBFIELD_MARKER){
5049
ctx.emitStartEntity();
5150
next = SUBFIELD_NAME;
52-
break;
53-
default:
51+
}else{
5452
ctx.appendText(ch);
5553
next = this;
5654
}
@@ -67,17 +65,14 @@ protected void endOfInput(final PicaParserContext ctx) {
6765
@Override
6866
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
6967
final PicaParserState next;
70-
switch (ch) {
71-
case PicaConstants.RECORD_MARKER:
72-
case PicaConstants.FIELD_MARKER:
73-
case PicaConstants.FIELD_END_MARKER:
68+
if(ch==PicaConstants.RECORD_MARKER ||
69+
ch==PicaConstants.FIELD_MARKER ||
70+
ch==PicaConstants.FIELD_END_MARKER){
7471
ctx.emitEndEntity();
7572
next = FIELD_NAME;
76-
break;
77-
case PicaConstants.SUBFIELD_MARKER:
73+
}else if(ch==PicaConstants.SUBFIELD_MARKER)
7874
next = this;
79-
break;
80-
default:
75+
else{
8176
ctx.setSubfieldName(ch);
8277
next = SUBFIELD_VALUE;
8378
}
@@ -93,19 +88,16 @@ protected void endOfInput(final PicaParserContext ctx) {
9388
@Override
9489
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
9590
final PicaParserState next;
96-
switch (ch) {
97-
case PicaConstants.RECORD_MARKER:
98-
case PicaConstants.FIELD_MARKER:
99-
case PicaConstants.FIELD_END_MARKER:
91+
if(ch==PicaConstants.RECORD_MARKER ||
92+
ch==PicaConstants.FIELD_MARKER ||
93+
ch==PicaConstants.FIELD_END_MARKER){
10094
ctx.emitLiteral();
10195
ctx.emitEndEntity();
10296
next = FIELD_NAME;
103-
break;
104-
case PicaConstants.SUBFIELD_MARKER:
97+
}else if(ch==PicaConstants.SUBFIELD_MARKER){
10598
ctx.emitLiteral();
10699
next = SUBFIELD_NAME;
107-
break;
108-
default:
100+
}else{
109101
ctx.appendText(ch);
110102
next = this;
111103
}
@@ -122,5 +114,4 @@ protected void endOfInput(final PicaParserContext ctx) {
122114
protected abstract PicaParserState parseChar(final char ch, final PicaParserContext ctx);
123115

124116
protected abstract void endOfInput(final PicaParserContext ctx);
125-
126117
}

metafacture-biblio/src/test/java/org/metafacture/biblio/pica/PicaDecoderTest.java

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2016 Christoph Böhme
2+
* Copyright 2016-2019 Christoph Böhme and hbz
33
*
44
* Licensed under the Apache License, Version 2.0 the "License";
55
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16+
1617
package org.metafacture.biblio.pica;
1718

1819
import static org.mockito.Mockito.inOrder;
@@ -32,6 +33,7 @@
3233
* Tests for class {@link PicaDecoder}.
3334
*
3435
* @author Christoph Böhme
36+
* @author Pascal Christoph (dr0i)
3537
*
3638
*/
3739
public final class PicaDecoderTest {
@@ -50,6 +52,10 @@ public final class PicaDecoderTest {
5052
private static final String SUBFIELD_MARKER = "\u001f";
5153
private static final String FIELD_END_MARKER = "\n";
5254

55+
private static final String NONNORMALIZED_RECORD_MARKER = "\n";
56+
private static final String NONNORMALIZED_SUBFIELD_MARKER = "$";
57+
private static final String NONNORMALIZED_FIELD_END_MARKER = "\n";
58+
5359
private static final String FIELD_001AT_0_TEST = "001@ " + SUBFIELD_MARKER + "0test";
5460
private static final String FIELD_003AT_0_ID = "003@ " + SUBFIELD_MARKER + "0" + RECORD_ID;
5561
private static final String FIELD_107F_0_ID = "107F " + SUBFIELD_MARKER + "0" + RECORD_ID;
@@ -59,6 +65,9 @@ public final class PicaDecoderTest {
5965
private static final String FIELD_021A_A_UEBER = "021A " + SUBFIELD_MARKER + "a" + COMPOSED_UTF8;
6066
private static final String FIELD_028A = ENTITY_028A + " ";
6167

68+
private static final String NONNORMALIZED_FIELD_001AT_0_TEST = "001@ " + NONNORMALIZED_SUBFIELD_MARKER + "0test";
69+
private static final String NONNORMALIZED_FIELD_003AT_0_ID = "003@ " + NONNORMALIZED_SUBFIELD_MARKER + "0" + RECORD_ID;
70+
6271
private PicaDecoder picaDecoder;
6372

6473
@Mock
@@ -562,6 +571,24 @@ public void shouldNotTrimWhitespaceInFieldNamesIfConfigured() {
562571
verify(receiver).startEntity(" fieldname ");
563572
}
564573

574+
@Test
575+
public void nonNormalizedPica() {
576+
picaDecoder.setNormalizedSerialization(false);
577+
picaDecoder.process(
578+
NONNORMALIZED_FIELD_001AT_0_TEST +
579+
NONNORMALIZED_FIELD_END_MARKER +
580+
NONNORMALIZED_FIELD_003AT_0_ID +
581+
NONNORMALIZED_RECORD_MARKER);
582+
try {
583+
verify(receiver).startEntity("001@");
584+
verify(receiver).literal("0", "test");
585+
verify(receiver).startEntity("003@");
586+
verify(receiver).literal("0", "2809");
587+
} finally { //ensure reset to the default used by the other tests
588+
picaDecoder.setNormalizedSerialization(true);
589+
}
590+
}
591+
565592
private void verify003At0ID(final InOrder ordered) {
566593
ordered.verify(receiver).startEntity("003@");
567594
ordered.verify(receiver).literal("0", RECORD_ID);

0 commit comments

Comments
 (0)