23
23
24
24
25
25
/**
26
- * Parses a PICA+ record with UTF8 encoding assumed.
26
+ * <p>Parses pica+ records. The parser only parses single records.
27
+ * A string containing multiple records must be split into
28
+ * individual records before passing it to {@code PicaDecoder}.</p>
29
+ *
30
+ * <p>The parser is designed to accept any string as valid input and
31
+ * to parse pica plain format as well as normalised pica. To
32
+ * achieve this, the parser behaves as follows:</p>
33
+ *
34
+ * <ul>
35
+ * <li>Fields are separated by record markers (0x1d), field
36
+ * markers (0x1e) or field end markers (0x0a).</li>
37
+ * <li>The field name and the first subfield are separated by
38
+ * a subfield marker (0x1f).</li>
39
+ * <li>The parser assumes that the input starts with a field
40
+ * name.</li>
41
+ * <li>The parser assumes that the end of the input marks
42
+ * the end of the current field and the end of the record.
43
+ * </li>
44
+ * <li>Subfields are separated by subfield markers (0x1f).</li>
45
+ * <li>The first character of a subfield is the name of the
46
+ * subfield.</li>
47
+ * <li>To handle input with multiple field and subfield separators
48
+ * following each other directly (for instance 0x0a and 0x1e), it
49
+ * is assumed that field names, subfields, subfield names or
50
+ * subfield values can be empty.</li>
51
+ * </ul>
52
+ *
53
+ * <p>Please note that the record marker is treated as a field
54
+ * delimiter and not as a record delimiter. Records need to be
55
+ * separated prior to parsing them.</p>
56
+ *
57
+ * <p>As the behaviour of the parser may result in unnamed fields or
58
+ * subfields or fields with no subfields, the {@code PicaDecoder}
59
+ * automatically filters empty fields and subfields:</p>
60
+ *
61
+ * <ul>
62
+ * <li>Subfields without a name are ignored (such subfields cannot
63
+ * have any value because then the first character of the value
64
+ * would be the subfield name).</li>
65
+ * <li>Subfields which only have a name but no value are always
66
+ * parsed.</li>
67
+ * <li>Unnamed fields are only parsed if they contain not-ignored
68
+ * subfields.</li>
69
+ * <li>Named fields containing none or only ignored subfields are
70
+ * only parsed if {@code skipEmptyFields} is set to {@code false};
71
+ * otherwise they are ignored.</li>
72
+ * <li>Input containing only whitespace (spaces and tabs) is
73
+ * completely ignored.</li>
74
+ * </ul>
75
+ *
76
+ * <p>The {@code PicaDecoder} calls {@code receiver.startEntity} and
77
+ * {@code receiver.endEntity} for each parsed field and
78
+ * {@code receiver.literal} for each parsed subfield. Spaces in the
79
+ * field name are not included in the entity name. The input
80
+ * "028A \x1faAndy\x1fdWarhol\x1e" would produce the following
81
+ * sequence of calls:</p>
27
82
*
28
- * For each field in the stream the module calls:
29
83
* <ol>
30
- * <li>receiver.startEntity</li>
31
- * <li>receiver.literal for each subfield of the field</li>
32
- * <li>receiver.endEntity</li>
84
+ * <li>receiver.startEntity("028A")</li>
85
+ * <li>receiver.literal("a", "Andy")</li>
86
+ * <li>receiver.literal("d", "Warhol")</li>
87
+ * <li>receiver.endEntity()</li>
33
88
* </ol>
34
89
*
35
- * Spaces in the field name are not included in the entity name.
36
- *
37
- * Empty subfields are skipped. For instance, processing the following input
38
- * would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
39
- * skips unnamed fields without any subfields.
90
+ * <p>The content of subfield 003@$0 is used for the record id. If
91
+ * {@code ignoreMissingIdn} is false and field 003@$0 is not found
92
+ * in the record, a {@link MissingIdException} is thrown.</p>
40
93
*
41
- * If {@code ignoreMissingIdn} is false and field 003@$0 is not found in the
42
- * record a {@link MissingIdException} is thrown.
94
+ * <p>The parser assumes that the input is utf-8 encoded. The parser
95
+ * does not support other pica encodings.</p>
43
96
*
44
97
* @author Christoph Böhme
45
98
*
46
99
*/
47
- @ Description ("Parses a PICA+ record with UTF8 encoding assumed." )
100
+ @ Description ("Parses pica+ records. The parser only parses single records. " +
101
+ "A string containing multiple records must be split into " +
102
+ "individual records before passing it to PicaDecoder." )
48
103
@ In (String .class )
49
104
@ Out (StreamReceiver .class )
50
105
public final class PicaDecoder
51
106
extends DefaultObjectPipe <String , StreamReceiver > {
52
107
53
- private static final char [] ID_FIELD = {'0' , '0' , '3' , '@' , ' ' , PicaConstants .SUBFIELD_DELIMITER , '0' };
108
+ private static final char [] ID_FIELD = {'0' , '0' , '3' , '@' , ' ' , PicaConstants .SUBFIELD_MARKER , '0' };
54
109
55
110
private static final int BUFFER_SIZE = 1024 * 1024 ;
56
111
@@ -144,10 +199,9 @@ private boolean recordIsEmpty() {
144
199
/**
145
200
* Searches the record for the sequence specified in {@code ID_FIELD}
146
201
* and returns all characters following this sequence until the next
147
- * {@link PicaConstants.FIELD_DELIMITER},
148
- * {@link PicaConstants.SUBFIELD_DELIMITER} or the end of the record
149
- * is reached. Only the first occurrence of the sequence is processed,
150
- * later occurrences are ignored.
202
+ * control character (see {@link PicaConstants}) is found or the end of
203
+ * the record is reached. Only the first occurrence of the sequence is
204
+ * processed, later occurrences are ignored.
151
205
*
152
206
* If the sequence is not found in the string or if it is not followed
153
207
* by any characters then {@code null} is returned.
@@ -161,7 +215,7 @@ private String extractRecordId() {
161
215
int fieldPos = 0 ;
162
216
boolean skip = false ;
163
217
for (int i = 0 ; i < recordLen ; ++i ) {
164
- if (buffer [i ] == PicaConstants . FIELD_DELIMITER ) {
218
+ if (isFieldDelimiter ( buffer [i ]) ) {
165
219
if (idBuilder .length () > 0 ) {
166
220
break ;
167
221
}
@@ -176,7 +230,7 @@ private String extractRecordId() {
176
230
skip = true ;
177
231
}
178
232
} else {
179
- if (buffer [i ] == PicaConstants .SUBFIELD_DELIMITER ) {
233
+ if (buffer [i ] == PicaConstants .SUBFIELD_MARKER ) {
180
234
break ;
181
235
}
182
236
idBuilder .append (buffer [i ]);
@@ -191,4 +245,10 @@ private String extractRecordId() {
191
245
return null ;
192
246
}
193
247
248
+ private static boolean isFieldDelimiter (final char ch ) {
249
+ return ch == PicaConstants .RECORD_MARKER
250
+ || ch == PicaConstants .FIELD_MARKER
251
+ || ch == PicaConstants .FIELD_END_MARKER ;
252
+ }
253
+
194
254
}
0 commit comments