2222import org .culturegraph .mf .framework .annotations .Out ;
2323import org .culturegraph .mf .util .StringUtil ;
2424
25-
2625/**
27- * <p>Parses pica+ records. The parser only parses single records.
28- * A string containing multiple records must be split into
29- * individual records before passing it to {@code PicaDecoder}.</p>
26+ * Parses pica+ records. The parser only parses single records. A string
27+ * containing multiple records must be split into individual records before
28+ * passing it to {@code PicaDecoder}.
29+ * <p>
30+ * The parser is designed to accept any string as valid input and to parse pica
31+ * plain format as well as normalised pica. To achieve this, the parser behaves
32+ * as follows:
33+ * <ul>
34+ * <li>The parser assumes that the input starts with a field name.
3035 *
31- * <p>The parser is designed to accept any string as valid input and
32- * to parse pica plain format as well as normalised pica. To
33- * achieve this, the parser behaves as following:</p>
36+ * <li>The field name and the first subfield are separated by a subfield
37+ * marker (\u001f).
3438 *
35- * <ul>
36- * <li>Fields are separated by record markers (0x1d), field
37- * markers (0x1e) or field end markers (0x0a).</li>
38- * <li>The field name and the first subfield are separated by
39- * a subfield marker (0x01f).</li>
40- * <li>The parser assumes that the input starts with a field
41- * name.</li>
42- * <li>The parser assumes that the end of the input marks
43- * the end of the current field and the end of the record.
44- * </li>
45- * <li>Subfields are separated by subfield markers (0x1f).</li>
46- * <li>The first character of a subfield is the name of the
47- * subfield</li>
48- * <li>To handle input with multiple field and subfield separators
49- * following each other directly (for instance 0x0a and 0x1e), it
50- * is assumed that field names, subfields, subfield names or
51- * subfield values can be empty.</li>
52- * </ul>
39+ * <li>Fields are separated by record markers (\u001d), field
40+ * markers (\u001e) or field end markers (\u000a).
5341 *
54- * <p>Please note that the record markers is treated as a field
55- * delimiter and not as a record delimiter. Records need to be
56- * separated prior to parsing them.</p>
42+ * <li>Subfields are separated by subfield markers (\u001f).
5743 *
58- * <p>As the behaviour of the parser may result in unnamed fields or
59- * subfields or fields with no subfields the {@code PicaDecoder}
60- * automatically filters empty fields and subfields:</p>
44+ * <li>The first character of a subfield is the name of the subfield.
6145 *
62- * <ul>
63- * <li>Subfields without a name are ignored (such fields cannot
64- * have any value because then the first character of the value
65- * would be the field name).</li>
66- * <li>Subfields which only have a name but no value are always
67- * parsed.</li>
68- * <li>Unnamed Fields are only parsed if the contain not-ignored
69- * subfields.</li>
70- * <li>Named fields containing none or only ignored subfields are
71- * only parsed if {@code skipEmptyFields} is set to {@code false}
72- * otherwise they are ignored.</li>
73- * <li>Input containing only whitespace (spaces and tabs) is
74- * completely ignored</li>
46+ * <li>The parser assumes that the end of the input marks the end of the
47+ * current field and the end of the record.
48+ *
49+ * <li>To handle input with multiple field and subfield separators following
50+ * each other directly (for instance \u000a and \u001e), it is assumed
51+ * that field names, subfields, subfield names or subfield values can be
52+ * empty.
7553 * </ul>
54+ * Please note that the record marker is treated as a field delimiter and not
55+ * as a record delimiter. Records need to be separated prior to parsing them.
56+ * <p>
57+ * As the behaviour of the parser may result in unnamed fields or subfields or
58+ * fields with no subfields, the {@code PicaDecoder} automatically filters empty
59+ * fields and subfields:
60+ * <ul>
61+ * <li>Subfields without a name are ignored (such subfields cannot have any
62+ * value because then the first character of the value would be the name of
63+ * the subfield).
7664 *
77- * <p>The {@code PicaDecoder} calls {@code receiver.startEntity} and
78- * {@code receiver.endEntity} for each parsed field and
79- * {@code receiver.literal} for each parsed subfield. Spaces in the
80- * field name are not included in the entity name. The input
81- * "028A \x1faAndy\x1fdWarhol\x1e" would produce the following
82- * sequence of calls:</p>
65+ * <li>Subfields which only have a name but no value are always parsed.
8366 *
84- * <ol>
85- * <li>receiver.startEntity("028A")</li>
86- * <li>receiver.literal("a", "Andy")</li>
87- * <li>receiver.literal("d", "Warhol")</li>
88- * <li>receiver.endEntity()</li>
89- * </ol>
67+ * <li>Unnamed fields are only parsed if they contain not-ignored subfields.
9068 *
91- * <p>The content of subfield 003@$0 is used for the record id. If
92- * {@code ignoreMissingIdn } is false and field 003@$0 is not found
93- * in the record a {@link MissingIdException} is thrown.</p>
69+ * <li>Named fields containing none or only ignored subfields are only parsed
70+ * if {@link #setSkipEmptyFields(boolean)} is set to false; otherwise they are
71+ * ignored.
9472 *
95- * <p>The parser assumes that the input is utf-8 encoded. The parser
96- * does not support other pica encodings.</p>
73+ * <li>Input containing only whitespace (spaces and tabs) is completely
74+ * ignored.
75+ * </ul>
76+ * The {@code PicaDecoder} emits <i>start-entity</i> and <i>end-entity</i>
77+ * events for each parsed field and <i>literal</i> events for each parsed
78+ * subfield. Field names are trimmed by default (leading and trailing whitespace
79+ * is removed). This can be changed by setting
80+ * {@link #setTrimFieldNames(boolean)} to false.
81+ * <p>
82+ * The content of subfield <i>003@ $0</i> is used as record id. If
83+ * {@link #setIgnoreMissingIdn(boolean)} is false and subfield
84+ * <i>003@ $0</i> is not found in the record, a
85+ * {@link MissingIdException} is thrown; otherwise the record identifier is an
86+ * empty string.
87+ * <p>
88+ * For example, when run on the input
89+ * <pre>
90+ * 003@ \u001f01234\u001e
91+ * 028A \u001faAndy\u001fdWarhol\u001e
92+ * </pre>
93+ *
94+ * the {@code PicaDecoder} will produce the following sequence of events:
95+ * <pre>{@literal
96+ * start-record "1234"
97+ * start-entity "003@"
98+ * literal "0": 1234
99+ * end-entity
100+ * start-entity "028A"
101+ * literal "a": Andy
102+ * literal "d": Warhol
103+ * end-entity
104+ * end-record
105+ * }</pre>
106+ *
107+ * The parser assumes that the input is utf-8 encoded. The parser does not
108+ * support other pica encodings.
97109 *
98110 * @author Christoph Böhme
99111 *
@@ -118,6 +130,21 @@ public final class PicaDecoder
118130
119131 private boolean ignoreMissingIdn ;
120132
133+ /**
134+ * Controls whether records lacking the pica subfield <i>003@ $0</i>
135+ * (which contains the record identifier <i>IDN</i>) are tolerated or
136+ * reported as faulty. By default such records are reported by the
137+ * {@code PicaDecoder} by throwing a {@link MissingIdException}.
138+ * <p>
139+ * The setting can be changed at any time. It becomes effective with the next
140+ * record that is being processed.
141+ * <p>
142+ * <strong>Default value: {@code false}</strong>
143+ *
144+ * @param ignoreMissingIdn if true, a missing IDN does not trigger a
145+ * {@link MissingIdException}; an empty string is
146+ * used as the record identifier instead.
147+ */
121148 public void setIgnoreMissingIdn (final boolean ignoreMissingIdn ) {
122149 this .ignoreMissingIdn = ignoreMissingIdn ;
123150 }
@@ -126,6 +153,20 @@ public boolean getIgnoreMissingIdn() {
126153 return ignoreMissingIdn ;
127154 }
128155
156+ /**
157+ * Controls whether decomposed Unicode characters in field values are
158+ * normalised to their precomposed version. By default no normalisation is
159+ * applied. The normalisation is only applied to values, not to field or
160+ * subfield names.
161+ * <p>
162+ * The setting can be changed at any time. It becomes effective with the next
163+ * record that is being processed.
164+ * <p>
165+ * <strong>Default value: {@code false}</strong>
166+ *
167+ * @param normalizeUTF8 if true, decomposed Unicode characters in values are
168+ * normalised to their precomposed version.
169+ */
129170 public void setNormalizeUTF8 (final boolean normalizeUTF8 ) {
130171 parserContext .setNormalizeUTF8 (normalizeUTF8 );
131172 }
@@ -134,6 +175,17 @@ public boolean getNormalizeUTF8() {
134175 return parserContext .getNormalizeUTF8 ();
135176 }
136177
178+ /**
179+ * Controls whether fields without subfields are skipped, meaning that no
180+ * events are emitted for them. By default empty fields are skipped.
181+ * <p>
182+ * The setting can be changed at any time. It becomes effective with the next
183+ * record that is being processed.
184+ * <p>
185+ * <strong>Default value: {@code true}</strong>
186+ *
187+ * @param skipEmptyFields if true, empty fields are skipped; if false, they are parsed.
188+ */
137189 public void setSkipEmptyFields (final boolean skipEmptyFields ) {
138190 parserContext .setSkipEmptyFields (skipEmptyFields );
139191 }
@@ -142,6 +194,24 @@ public boolean getSkipEmptyFields() {
142194 return parserContext .getSkipEmptyFields ();
143195 }
144196
197+ /**
198+ * Controls whether field names are trimmed (leading and trailing
199+ * whitespace is removed). By default field names are trimmed.
200+ * <p>
201+ * The setting can be changed at any time. It becomes effective with the next
202+ * record that is being processed.
203+ * <p>
204+ * <strong>Default value: {@code true}</strong>
205+ *
206+ * @param trimFieldNames if true, field names are trimmed.
207+ */
208+ public void setTrimFieldNames (final boolean trimFieldNames ) {
209+ parserContext .setTrimFieldNames (trimFieldNames );
210+ }
211+
/**
 * Returns the current field-name trimming setting by delegating to
 * {@code parserContext}.
 *
 * @return true if field names are trimmed.
 */
212+ public boolean getTrimFieldNames () {
213+ return parserContext .getTrimFieldNames ();
214+ }
145215 @ Override
146216 public void process (final String record ) {
147217 assert !isClosed ();
0 commit comments