|
23 | 23 |
|
24 | 24 |
|
25 | 25 | /**
|
26 |
| - * Parses a PICA+ record with UTF8 encoding assumed. |
| 26 | + * <p>Parses pica+ records. The parser only parses single records. |
| 27 | + * A string containing multiple records must be split into |
| 28 | + * individual records before passing it to {@code PicaDecoder}.</p> |
| 29 | + * |
| 30 | + * <p>The parser is designed to accept any string as valid input and |
| 31 | + * to parse pica plain format as well as normalised pica. To |
| 32 | + * achieve this, the parser behaves as follows:</p> |
| 33 | + * |
| 34 | + * <ul> |
| 35 | + * <li>Fields are separated by record markers (0x1d), field |
| 36 | + * markers (0x1e) or field end markers (0x0a).</li> |
| 37 | + * <li>The field name and the first subfield are separated by |
| 38 | + * a subfield marker (0x1f).</li> |
| 39 | + * <li>The parser assumes that the input starts with a field |
| 40 | + * name.</li> |
| 41 | + * <li>The parser assumes that the end of the input marks |
| 42 | + * the end of the current field and the end of the record. |
| 43 | + * </li> |
| 44 | + * <li>Subfields are separated by subfield markers (0x1f).</li> |
| 45 | + * <li>The first character of a subfield is the name of the |
| 46 | + * subfield</li> |
| 47 | + * <li>To handle input with multiple field and subfield separators |
| 48 | + * following each other directly (for instance 0x0a and 0x1e), it |
| 49 | + * is assumed that field names, subfields, subfield names or |
| 50 | + * subfield values can be empty.</li> |
| 51 | + * </ul> |
| 52 | + * |
| 53 | + * <p>Please note that the record marker is treated as a field |
| 54 | + * delimiter and not as a record delimiter. Records need to be |
| 55 | + * separated prior to parsing them.</p> |
| 56 | + * |
| 57 | + * <p>As the behaviour of the parser may result in unnamed fields or |
| 58 | + * subfields or fields with no subfields the {@code PicaDecoder} |
| 59 | + * automatically filters empty fields and subfields:</p> |
| 60 | + * |
| 61 | + * <ul> |
| 62 | + * <li>Subfields without a name are ignored (such subfields cannot |
| 63 | + * have any value because then the first character of the value |
| 64 | + * would be the subfield name).</li> |
| 65 | + * <li>Subfields which only have a name but no value are always |
| 66 | + * parsed.</li> |
| 67 | + * <li>Unnamed fields are only parsed if they contain non-ignored |
| 68 | + * subfields.</li> |
| 69 | + * <li>Named fields containing none or only ignored subfields are |
| 70 | + * only parsed if {@code skipEmptyFields} is set to {@code false}; |
| 71 | + * otherwise they are ignored.</li> |
| 72 | + * <li>Input containing only whitespace (spaces and tabs) is |
| 73 | + * completely ignored</li> |
| 74 | + * </ul> |
| 75 | + * |
| 76 | + * <p>The {@code PicaDecoder} calls {@code receiver.startEntity} and |
| 77 | + * {@code receiver.endEntity} for each parsed field and |
| 78 | + * {@code receiver.literal} for each parsed subfield. Spaces in the |
| 79 | + * field name are not included in the entity name. The input |
| 80 | + * "028A \x1faAndy\x1fdWarhol\x1e" would produce the following |
| 81 | + * sequence of calls:</p> |
27 | 82 | *
|
28 |
| - * For each field in the stream the module calls: |
29 | 83 | * <ol>
|
30 |
| - * <li>receiver.startEntity</li> |
31 |
| - * <li>receiver.literal for each subfield of the field</li> |
32 |
| - * <li>receiver.endEntity</li> |
| 84 | + * <li>receiver.startEntity("028A")</li> |
| 85 | + * <li>receiver.literal("a", "Andy")</li> |
| 86 | + * <li>receiver.literal("d", "Warhol")</li> |
| 87 | + * <li>receiver.endEntity()</li> |
33 | 88 | * </ol>
|
34 | 89 | *
|
35 |
| - * Spaces in the field name are not included in the entity name. |
36 |
| - * |
37 |
| - * Empty subfields are skipped. For instance, processing the following input |
38 |
| - * would NOT produce an empty literal: 003@ \u001f\u001e. The parser also |
39 |
| - * skips unnamed fields without any subfields. |
| 90 | + * <p>The content of subfield 003@$0 is used for the record id. If |
| 91 | + * {@code ignoreMissingIdn} is false and field 003@$0 is not found |
| 92 | + * in the record a {@link MissingIdException} is thrown.</p> |
40 | 93 | *
|
41 |
| - * If {@code ignoreMissingIdn} is false and field 003@$0 is not found in the |
42 |
| - * record a {@link MissingIdException} is thrown. |
| 94 | + * <p>The parser assumes that the input is utf-8 encoded. The parser |
| 95 | + * does not support other pica encodings.</p> |
43 | 96 | *
|
44 | 97 | * @author Christoph Böhme
|
45 | 98 | *
|
46 | 99 | */
|
47 |
| -@Description("Parses a PICA+ record with UTF8 encoding assumed.") |
| 100 | +@Description("Parses pica+ records. The parser only parses single records. " + |
| 101 | + "A string containing multiple records must be split into " + |
| 102 | + "individual records before passing it to PicaDecoder.") |
48 | 103 | @In(String.class)
|
49 | 104 | @Out(StreamReceiver.class)
|
50 | 105 | public final class PicaDecoder
|
@@ -144,7 +199,7 @@ private boolean recordIsEmpty() {
|
144 | 199 | /**
|
145 | 200 | * Searches the record for the sequence specified in {@code ID_FIELD}
|
146 | 201 | * and returns all characters following this sequence until the next
|
147 |
| - * control character (see {@link PicaConstants} is found or the end of |
| 202 | + * control character (see {@link PicaConstants}) is found or the end of |
148 | 203 | * the record is reached. Only the first occurrence of the sequence is
|
149 | 204 | * processed, later occurrences are ignored.
|
150 | 205 | *
|
|
0 commit comments