20
20
import org .culturegraph .mf .framework .annotations .Description ;
21
21
import org .culturegraph .mf .framework .annotations .In ;
22
22
import org .culturegraph .mf .framework .annotations .Out ;
23
+ import org .culturegraph .mf .util .StringUtil ;
23
24
24
25
25
26
/**
26
27
* <p>Parses pica+ records. The parser only parses single records.
27
28
* A string containing multiple records must be split into
28
29
* individual records before passing it to {@code PicaDecoder}.</p>
29
- *
30
+ *
30
31
* <p>The parser is designed to accept any string as valid input and
31
32
* to parse pica plain format as well as normalised pica. To
32
33
* achieve this, the parser behaves as follows:</p>
33
- *
34
+ *
34
35
* <ul>
35
36
* <li>Fields are separated by record markers (0x1d), field
36
37
* markers (0x1e) or field end markers (0x0a).</li>
49
50
* is assumed that field names, subfields, subfield names or
50
51
* subfield values can be empty.</li>
51
52
* </ul>
52
- *
53
+ *
53
54
* <p>Please note that the record marker is treated as a field
54
55
* delimiter and not as a record delimiter. Records need to be
55
56
* separated prior to parsing them.</p>
56
- *
57
+ *
57
58
* <p>As the behaviour of the parser may result in unnamed fields or
58
59
* subfields or fields with no subfields the {@code PicaDecoder}
59
60
* automatically filters empty fields and subfields:</p>
60
- *
61
+ *
61
62
* <ul>
62
63
* <li>Subfields without a name are ignored (such fields cannot
63
64
* have any value because then the first character of the value
72
73
* <li>Input containing only whitespace (spaces and tabs) is
73
74
* completely ignored</li>
74
75
* </ul>
75
- *
76
+ *
76
77
* <p>The {@code PicaDecoder} calls {@code receiver.startEntity} and
77
78
* {@code receiver.endEntity} for each parsed field and
78
79
* {@code receiver.literal} for each parsed subfield. Spaces in the
79
80
* field name are not included in the entity name. The input
80
81
* "028A \x1faAndy\x1fdWarhol\x1e" would produce the following
81
82
* sequence of calls:</p>
82
- *
83
+ *
83
84
* <ol>
84
85
* <li>receiver.startEntity("028A")</li>
85
86
* <li>receiver.literal("a", "Andy")</li>
86
87
* <li>receiver.literal("d", "Warhol")</li>
87
88
* <li>receiver.endEntity()</li>
88
89
* </ol>
89
- *
90
+ *
90
91
* <p>The content of subfield 003@$0 is used for the record id. If
91
92
* {@code ignoreMissingIdn} is false and field 003@$0 is not found
92
93
* in the record a {@link MissingIdException} is thrown.</p>
93
- *
94
+ *
94
95
* <p>The parser assumes that the input is utf-8 encoded. The parser
95
96
* does not support other pica encodings.</p>
96
- *
97
+ *
97
98
* @author Christoph Böhme
98
- *
99
+ *
99
100
*/
100
101
@ Description ("Parses pica+ records. The parser only parses single records. " +
101
102
"A string containing multiple records must be split into " +
@@ -108,49 +109,50 @@ public final class PicaDecoder
108
109
/**
 * Character sequence that introduces the record id: the field name
 * "003@", a space, the subfield marker and the subfield name "0".
 * {@code extractRecordId} searches the record for this sequence.
 */
private static final char [] ID_FIELD = {'0' , '0' , '3' , '@' , ' ' , PicaConstants .SUBFIELD_MARKER , '0' };
109
110
110
111
/** Initial capacity, in chars, of the buffer the record is copied into. */
private static final int BUFFER_SIZE = 1024 * 1024 ;
111
-
112
+
112
113
private final StringBuilder idBuilder = new StringBuilder ();
113
114
private final PicaParserContext parserContext = new PicaParserContext ();
114
-
115
+
115
116
private char [] buffer = new char [BUFFER_SIZE ];
116
117
private int recordLen ;
117
-
118
+
118
119
private boolean ignoreMissingIdn ;
119
120
120
121
/**
 * Controls how records without an id are handled. If the flag is
 * {@code false} and subfield 003@$0 is not found in a record, a
 * {@link MissingIdException} is thrown while processing it.
 *
 * @param ignoreMissingIdn {@code true} to tolerate records that have no
 *                         003@$0 subfield, {@code false} to reject them
 */
public void setIgnoreMissingIdn (final boolean ignoreMissingIdn ) {
121
122
this .ignoreMissingIdn = ignoreMissingIdn ;
122
123
}
123
-
124
+
124
125
/**
 * Returns the flag set via {@code setIgnoreMissingIdn}.
 *
 * @return the current value of the {@code ignoreMissingIdn} flag
 */
public boolean getIgnoreMissingIdn () {
125
126
return ignoreMissingIdn ;
126
127
}
127
-
128
+
128
129
/**
 * Forwards the UTF-8 normalisation setting to the parser context.
 * NOTE(review): the exact normalisation applied is defined by
 * {@code PicaParserContext} and is not visible here.
 *
 * @param normalizeUTF8 value handed to
 *                      {@code parserContext.setNormalizeUTF8}
 */
public void setNormalizeUTF8 (final boolean normalizeUTF8 ) {
129
130
parserContext .setNormalizeUTF8 (normalizeUTF8 );
130
131
}
131
-
132
+
132
133
/**
 * Reads the UTF-8 normalisation setting back from the parser context.
 *
 * @return the value reported by {@code parserContext.getNormalizeUTF8}
 */
public boolean getNormalizeUTF8 () {
133
134
return parserContext .getNormalizeUTF8 ();
134
135
}
135
-
136
+
136
137
/**
 * Forwards the skip-empty-fields setting to the parser context.
 * NOTE(review): presumably controls whether fields without subfields
 * are emitted; the behaviour lives in {@code PicaParserContext} and
 * should be confirmed there.
 *
 * @param skipEmptyFields value handed to
 *                        {@code parserContext.setSkipEmptyFields}
 */
public void setSkipEmptyFields (final boolean skipEmptyFields ) {
137
138
parserContext .setSkipEmptyFields (skipEmptyFields );
138
139
}
139
-
140
+
140
141
/**
 * Reads the skip-empty-fields setting back from the parser context.
 *
 * @return the value reported by {@code parserContext.getSkipEmptyFields}
 */
public boolean getSkipEmptyFields () {
141
142
return parserContext .getSkipEmptyFields ();
142
143
}
143
-
144
+
144
145
@ Override
145
146
public void process (final String record ) {
146
147
assert !isClosed ();
147
-
148
- copyToBuffer (record );
149
-
148
+
149
+ buffer = StringUtil .copyToBuffer (record , buffer );
150
+ recordLen = record .length ();
151
+
150
152
if (recordIsEmpty ()) {
151
153
return ;
152
154
}
153
-
155
+
154
156
String id = extractRecordId ();
155
157
if (id == null ) {
156
158
if (!ignoreMissingIdn ) {
@@ -165,28 +167,20 @@ public void process(final String record) {
165
167
state = state .parseChar (buffer [i ], parserContext );
166
168
}
167
169
state .endOfInput (parserContext );
168
-
170
+
169
171
getReceiver ().endRecord ();
170
172
}
171
-
173
+
172
174
/**
 * Hands the receiver of this module on to the parser context so that
 * the context works with the same receiver as the decoder.
 */
@ Override
173
175
protected void onSetReceiver () {
174
176
parserContext .setReceiver (getReceiver ());
175
177
}
176
-
178
+
177
179
/**
 * Resets the parser context when the stream is reset.
 */
@ Override
178
180
protected void onResetStream () {
179
181
parserContext .reset ();
180
182
}
181
-
182
- private void copyToBuffer (final String record ) {
183
- recordLen = record .length ();
184
- if (recordLen > buffer .length ) {
185
- buffer = new char [buffer .length * 2 ];
186
- }
187
- record .getChars (0 , recordLen , buffer , 0 );
188
- }
189
-
183
+
190
184
private boolean recordIsEmpty () {
191
185
for (int i = 0 ; i < recordLen ; ++i ) {
192
186
if (buffer [i ] != ' ' && buffer [i ] != '\t' ) {
@@ -195,23 +189,23 @@ private boolean recordIsEmpty() {
195
189
}
196
190
return true ;
197
191
}
198
-
192
+
199
193
/**
200
194
* Searches the record for the sequence specified in {@code ID_FIELD}
201
195
* and returns all characters following this sequence until the next
202
196
* control character (see {@link PicaConstants}) is found or the end of
203
197
* the record is reached. Only the first occurrence of the sequence is
204
198
* processed, later occurrences are ignored.
205
- *
199
+ *
206
200
* If the sequence is not found in the string or if it is not followed
207
201
* by any characters then {@code null} is returned.
208
- *
202
+ *
209
203
* @return value of subfield 003@$0 or null if the
210
204
* field is not found or is empty.
211
205
*/
212
206
private String extractRecordId () {
213
207
idBuilder .setLength (0 );
214
-
208
+
215
209
int fieldPos = 0 ;
216
210
boolean skip = false ;
217
211
for (int i = 0 ; i < recordLen ; ++i ) {
@@ -238,17 +232,17 @@ private String extractRecordId() {
238
232
}
239
233
}
240
234
}
241
-
235
+
242
236
if (idBuilder .length () > 0 ) {
243
237
return idBuilder .toString ();
244
238
}
245
239
return null ;
246
240
}
247
-
241
+
248
242
private static boolean isFieldDelimiter (final char ch ) {
249
243
return ch == PicaConstants .RECORD_MARKER
250
244
|| ch == PicaConstants .FIELD_MARKER
251
245
|| ch == PicaConstants .FIELD_END_MARKER ;
252
246
}
253
-
247
+
254
248
}
0 commit comments