1
1
/*
2
- * Copyright 2016 Christoph Böhme
2
+ * Copyright 2016, 2019 Christoph Böhme and others
3
3
*
4
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
5
* you may not use this file except in compliance with the License.
32
32
* containing multiple records must be split into individual records before
33
33
* passing it to {@code PicaDecoder}.
34
34
* <p>
35
- * The parser is designed to accept any string as valid input and to parse pica
36
- * plain format as well as normalised pica. To achieve this, the parser behaves
37
- * as following:
35
+ * The parser is designed to accept any string as valid input and to parse
36
+ * pica+ in its two serialization forms:
37
+ * as non-normalized and as normalized.
38
+ * To achieve this, the parser behaves as follows when parsing
39
+ * normalized pica+:
38
40
* <ul>
39
41
* <li>The parser assumes that the input starts with a field name.
40
42
*
56
58
* that field names, subfields, subfield names or subfield values can be
57
59
* empty.
58
60
* </ul>
61
+ * non-normalized pica+:
62
+ * <ul>
63
+ * <li>The parser assumes that the input starts with a field name.
64
+ *
65
+ * <li>The field name and the first subfield are separated by a subfield
66
+ * marker ($).
67
+ *
68
+ * <li>Fields are separated by record markers (\n) or field end
69
+ * markers (\n).
70
+ *
71
+ * <li>Subfields are separated by subfield markers ($).
72
+ *
73
+ * <li>The first character of a subfield is the name of the subfield
74
+ *
75
+ * <li>The parser assumes that the end of the input marks the end of the
76
+ * current field and the end of the record.
77
+ *
78
+ * <li>As multiple fields and subfields are not empty in non-normalized pica+
79
+ * they are just treated like anything else.
80
+ * </ul>
59
81
* Please note that the record marker is treated as a field delimiter and not
60
82
* as a record delimiter. Records need to be separated prior to parsing them.
61
83
* <p>
69
91
*
70
92
* <li>Subfields which only have a name but no value are always parsed.
71
93
*
72
- * <li>Unnamed fields are only parsed if the contain not-ignored subfields.
94
+ * <li>In normalized pica+ unnamed fields are only parsed if they contain
95
+ * not-ignored subfields. In non-normalized pica+ unnamed fields don't exist.
73
96
*
74
97
* <li>Named fields containing none or only ignored subfields are only parsed
75
98
* if {@link #setSkipEmptyFields(boolean)} is set to false otherwise they are
85
108
* {@link #setTrimFieldNames(boolean)} to false.
86
109
* <p>
87
110
* The record id emitted with the <i>start-record</i> event is extracted from
88
- * one of the following pica fields:
111
+ * one of the following non-normalized pica+ fields:
89
112
* <ul>
90
113
* <li><i>003@ $0</i>
91
114
* <li><i>107F $0</i>
97
120
* found in the record a {@link MissingIdException} is thrown otherwise the
98
121
* record identifier is an empty string.
99
122
* <p>
100
- * For example, when run on the input
123
+ * For example, when run on this input in its normalized serialization form:
101
124
* <pre>
102
125
* 003@ \u001f01234\u001e
103
126
* 028A \u001faAndy\u001fdWarhol\u001e
120
143
* support other pica encodings.
121
144
*
122
145
* @author Christoph Böhme
146
+ * @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
147
+ * @author Fabian Steeg (fsteeg) (switch to enum)
123
148
*
124
149
*/
125
150
@ Description ("Parses pica+ records. The parser only parses single records. " +
131
156
public final class PicaDecoder
132
157
extends DefaultObjectPipe <String , StreamReceiver > {
133
158
134
- private static final String START_MARKERS ="(?:^|" + PicaConstants .FIELD_MARKER +
135
- "|" + PicaConstants .FIELD_END_MARKER + "|" + PicaConstants .RECORD_MARKER + ")" ;
136
- private static final Pattern ID_FIELDS_PATTERN = Pattern .compile (
137
- START_MARKERS + "(?:003@|203@(?:/..+)?|107F) " + PicaConstants .SUBFIELD_MARKER + "0" );
138
-
159
+ private static String START_MARKERS ;
160
+ private static Pattern ID_FIELDS_PATTERN ;
139
161
private static final int BUFFER_SIZE = 1024 * 1024 ;
140
162
141
- private final Matcher idFieldMatcher = ID_FIELDS_PATTERN . matcher ( "" ) ;
163
+ private Matcher idFieldMatcher ;
142
164
private final StringBuilder idBuilder = new StringBuilder ();
143
165
private final PicaParserContext parserContext = new PicaParserContext ();
144
166
145
167
private char [] buffer = new char [BUFFER_SIZE ];
146
168
private int recordLen ;
147
169
148
170
private boolean ignoreMissingIdn ;
171
+ private boolean isNormalized ;
172
+
173
/**
 * Creates a decoder configured for normalized pica+ by delegating to
 * {@link #PicaDecoder(boolean)} with {@code true}.
 */
+ public PicaDecoder () {
174
+ this (true );
175
+ }
176
+
177
/**
 * Creates a decoder for the given serialization form.
 *
 * @param normalized passed on to {@link #setNormalizedSerialization(boolean)}:
 *                   {@code true} for normalized pica+, {@code false} for
 *                   non-normalized pica+
 */
+ public PicaDecoder (boolean normalized ) {
178
+ setNormalizedSerialization (normalized );
179
+ }
149
180
181
+ /**
182
+ * Controls whether the input is read as normalized or non-normalized
183
+ * pica+. By default, "normalized" is assumed.
184
+ *
185
+ * @param normalized if true, the input is treated as normalized pica+ ;
186
+ * if false, it's treated as non-normalized.
187
+ */
188
// Stores the flag, then rebuilds the record-id pattern and matcher, because
// makeConstants() looks up the marker characters using isNormalized.
+ public void setNormalizedSerialization (boolean normalized ) {
189
+ this .isNormalized = normalized ;
190
+ makeConstants ();
191
+ }
192
+
193
/*
 * Rebuilds the regular expression used to locate the record id and creates
 * a fresh matcher for it.
 *
 * START_MARKERS is an alternation of: start of input, the field marker,
 * the field end marker and the record marker (each looked up for the
 * current isNormalized setting), plus a ".*" followed by a newline branch.
 * ID_FIELDS_PATTERN appends the id field names (003@, 203@ with an optional
 * "/..." suffix, or 107F), then the subfield marker (escaped or unescaped)
 * and the subfield name "0".
 *
 * NOTE(review): START_MARKERS and ID_FIELDS_PATTERN are declared static but
 * are reassigned here from an instance method, so every call overwrites the
 * values for the whole class. Only idFieldMatcher is per instance. Consider
 * making them instance fields or locals; confirm no other code reads the
 * statics first.
 */
+ private void makeConstants () {
194
+ START_MARKERS = "(?:^|" + PicaConstants .FIELD_MARKER .get (isNormalized ) + "|"
195
+ + PicaConstants .FIELD_END_MARKER .get (isNormalized ) + "|"
196
+ + PicaConstants .RECORD_MARKER .get (isNormalized ) + "|.*\n " + ")" ;
197
+ ID_FIELDS_PATTERN = Pattern
198
+ .compile (START_MARKERS + "(?:003@|203@(?:/..+)?|107F) "
199
+ + " ?(\\ " + PicaConstants .SUBFIELD_MARKER .get (isNormalized ) + "|"
200
+ + PicaConstants .SUBFIELD_MARKER .get (isNormalized ) + ")0" );
201
+ idFieldMatcher = ID_FIELDS_PATTERN .matcher ("" );
202
+ }
150
203
/**
151
204
* Controls whether records having no record id are reported as faulty. By
152
205
* default such records are reported by the {@code PicaDecoder} by throwing
@@ -250,7 +303,7 @@ public void process(final String record) {
250
303
251
304
PicaParserState state = PicaParserState .FIELD_NAME ;
252
305
for (int i = 0 ; i < recordLen ; ++i ) {
253
- state = state .parseChar (buffer [i ], parserContext );
306
+ state = state .parseChar (buffer [i ], parserContext , isNormalized );
254
307
}
255
308
state .endOfInput (parserContext );
256
309
@@ -284,7 +337,7 @@ private String extractRecordId() {
284
337
idBuilder .setLength (0 );
285
338
for (int i = idFromIndex ; i < recordLen ; ++i ) {
286
339
final char ch = buffer [i ];
287
- if (isSubfieldDelimiter (ch )) {
340
+ if (isMarker (ch )) {
288
341
break ;
289
342
}
290
343
idBuilder .append (ch );
@@ -300,11 +353,8 @@ private int findRecordId() {
300
353
return idFieldMatcher .end ();
301
354
}
302
355
303
- private static boolean isSubfieldDelimiter (final char ch ) {
304
- return ch == PicaConstants .RECORD_MARKER
305
- || ch == PicaConstants .FIELD_MARKER
306
- || ch == PicaConstants .FIELD_END_MARKER
307
- || ch == PicaConstants .SUBFIELD_MARKER ;
356
/*
 * Tells whether ch is a marker character for the configured serialization:
 * true when PicaConstants.from(isNormalized, ch) returns anything other
 * than PicaConstants.NO_MARKER.
 */
+ private boolean isMarker (final char ch ) {
357
+ return PicaConstants .from (isNormalized , ch ) != PicaConstants .NO_MARKER ;
308
358
}
309
359
310
360
}
0 commit comments