11/*
2- * Copyright 2016 Christoph Böhme
2+ * Copyright 2016, 2019 Christoph Böhme and others
33 *
44 * Licensed under the Apache License, Version 2.0 (the "License");
55 * you may not use this file except in compliance with the License.
3232 * containing multiple records must be split into individual records before
3333 * passing it to {@code PicaDecoder}.
3434 * <p>
35- * The parser is designed to accept any string as valid input and to parse pica
36- * plain format as well as normalised pica. To achieve this, the parser behaves
37- * as following:
35+ * The parser is designed to accept any string as valid input and to parse
36+ * pica+ in its two serialization forms:
37+ * as non-normalized and as normalized.
38+ * To achieve this, the parser behaves as follows when parsing
39+ * normalized pica+:
3840 * <ul>
3941 * <li>The parser assumes that the input starts with a field name.
4042 *
5658 * that field names, subfields, subfield names or subfield values can be
5759 * empty.
5860 * </ul>
61+ * When parsing non-normalized pica+, the parser behaves as follows:
62+ * <ul>
63+ * <li>The parser assumes that the input starts with a field name.
64+ *
65+ * <li>The field name and the first subfield are separated by a subfield
66+ * marker ($).
67+ *
68+ * <li>Fields are separated by record markers (\n) or field end
69+ * markers (\n).
70+ *
71+ * <li>Subfields are separated by subfield markers ($).
72+ *
73+ * <li>The first character of a subfield is the name of the subfield
74+ *
75+ * <li>The parser assumes that the end of the input marks the end of the
76+ * current field and the end of the record.
77+ *
78+ * <li>As multiple fields and subfields are not empty in non-normalized pica+,
79+ * they are just treated like anything else.
80+ * </ul>
5981 * Please note that the record marker is treated as a field delimiter and not
6082 * as a record delimiter. Records need to be separated prior to parsing them.
6183 * <p>
6991 *
7092 * <li>Subfields which only have a name but no value are always parsed.
7193 *
72- * <li>Unnamed fields are only parsed if the contain not-ignored subfields.
94+ * <li>In normalized pica+ unnamed fields are only parsed if they contain
95+ * not-ignored subfields. In non-normalized pica+ unnamed fields don't exist.
7396 *
7497 * <li>Named fields containing none or only ignored subfields are only parsed
7598 * if {@link #setSkipEmptyFields(boolean)} is set to false otherwise they are
85108 * {@link #setTrimFieldNames(boolean)} to false.
86109 * <p>
87110 * The record id emitted with the <i>start-record</i> event is extracted from
88- * one of the following pica fields:
111+ * one of the following non-normalized pica+ fields:
89112 * <ul>
90113 * <li><i>003@ $0</i>
91114 * <li><i>107F $0</i>
97120 * found in the record a {@link MissingIdException} is thrown otherwise the
98121 * record identifier is an empty string.
99122 * <p>
100- * For example, when run on the input
123+ * For example, when run on this input in its normalized serialization form:
101124 * <pre>
102125 * 003@ \u001f01234\u001e
103126 * 028A \u001faAndy\u001fdWarhol\u001e
120143 * support other pica encodings.
121144 *
122145 * @author Christoph Böhme
146+ * @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
147+ * @author Fabian Steeg (fsteeg) (switch to enum)
123148 *
124149 */
125150@ Description ("Parses pica+ records. The parser only parses single records. " +
131156public final class PicaDecoder
132157 extends DefaultObjectPipe <String , StreamReceiver > {
133158
134- private static final String START_MARKERS ="(?:^|" + PicaConstants .FIELD_MARKER +
135- "|" + PicaConstants .FIELD_END_MARKER + "|" + PicaConstants .RECORD_MARKER + ")" ;
136- private static final Pattern ID_FIELDS_PATTERN = Pattern .compile (
137- START_MARKERS + "(?:003@|203@(?:/..+)?|107F) " + PicaConstants .SUBFIELD_MARKER + "0" );
138-
159+ private static String START_MARKERS ;
160+ private static Pattern ID_FIELDS_PATTERN ;
139161 private static final int BUFFER_SIZE = 1024 * 1024 ;
140162
141- private final Matcher idFieldMatcher = ID_FIELDS_PATTERN . matcher ( "" ) ;
163+ private Matcher idFieldMatcher ;
142164 private final StringBuilder idBuilder = new StringBuilder ();
143165 private final PicaParserContext parserContext = new PicaParserContext ();
144166
145167 private char [] buffer = new char [BUFFER_SIZE ];
146168 private int recordLen ;
147169
148170 private boolean ignoreMissingIdn ;
171+ private boolean isNormalized ;
172+
/**
 * Creates a decoder that reads normalized pica+. Delegates to
 * {@link #PicaDecoder(boolean)} with {@code true}.
 */
public PicaDecoder() {
    this(true);
}
176+
/**
 * Creates a decoder for the requested serialization form.
 *
 * @param normalized handed on unchanged to
 *                   {@link #setNormalizedSerialization(boolean)}
 */
public PicaDecoder(final boolean normalized) {
    setNormalizedSerialization(normalized);
}
149180
181+ /**
182+ * Controls whether the input is read as normalized or non-normalized
183+ * pica+. As the default "normalized" is assumed.
184+ *
185+ * @param normalized if true, the input is treated as normalized pica+ ;
186+ * if false, it's treated as non-normalized.
187+ */
188+ public void setNormalizedSerialization (boolean normalized ) {
189+ this .isNormalized = normalized ;
190+ makeConstants ();
191+ }
192+
193+ private void makeConstants () {
194+ START_MARKERS = "(?:^|" + PicaConstants .FIELD_MARKER .get (isNormalized ) + "|"
195+ + PicaConstants .FIELD_END_MARKER .get (isNormalized ) + "|"
196+ + PicaConstants .RECORD_MARKER .get (isNormalized ) + "|.*\n " + ")" ;
197+ ID_FIELDS_PATTERN = Pattern
198+ .compile (START_MARKERS + "(?:003@|203@(?:/..+)?|107F) "
199+ + " ?(\\ " + PicaConstants .SUBFIELD_MARKER .get (isNormalized ) + "|"
200+ + PicaConstants .SUBFIELD_MARKER .get (isNormalized ) + ")0" );
201+ idFieldMatcher = ID_FIELDS_PATTERN .matcher ("" );
202+ }
150203 /**
151204 * Controls whether records having no record id are reported as faulty. By
152205 * default such records are reported by the {@code PicaDecoder} by throwing
@@ -250,7 +303,7 @@ public void process(final String record) {
250303
251304 PicaParserState state = PicaParserState .FIELD_NAME ;
252305 for (int i = 0 ; i < recordLen ; ++i ) {
253- state = state .parseChar (buffer [i ], parserContext );
306+ state = state .parseChar (buffer [i ], parserContext , isNormalized );
254307 }
255308 state .endOfInput (parserContext );
256309
@@ -284,7 +337,7 @@ private String extractRecordId() {
284337 idBuilder .setLength (0 );
285338 for (int i = idFromIndex ; i < recordLen ; ++i ) {
286339 final char ch = buffer [i ];
287- if (isSubfieldDelimiter (ch )) {
340+ if (isMarker (ch )) {
288341 break ;
289342 }
290343 idBuilder .append (ch );
@@ -300,11 +353,8 @@ private int findRecordId() {
300353 return idFieldMatcher .end ();
301354 }
302355
303- private static boolean isSubfieldDelimiter (final char ch ) {
304- return ch == PicaConstants .RECORD_MARKER
305- || ch == PicaConstants .FIELD_MARKER
306- || ch == PicaConstants .FIELD_END_MARKER
307- || ch == PicaConstants .SUBFIELD_MARKER ;
356+ private boolean isMarker (final char ch ) {
357+ return PicaConstants .from (isNormalized , ch ) != PicaConstants .NO_MARKER ;
308358 }
309359
310360}
0 commit comments