Skip to content

Commit 5c8002e

Browse files
committed
Updated PicaDecoder to use StringUtil.copyToBuffer.
The original implementation in PicaDecoder had a bug (see issue #161). By refactoring the code to use the new StringUtil.copyToBuffer method this is fixed.
1 parent 7558607 commit 5c8002e

File tree

1 file changed

+37
-43
lines changed

1 file changed

+37
-43
lines changed

src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java

Lines changed: 37 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,18 @@
2020
import org.culturegraph.mf.framework.annotations.Description;
2121
import org.culturegraph.mf.framework.annotations.In;
2222
import org.culturegraph.mf.framework.annotations.Out;
23+
import org.culturegraph.mf.util.StringUtil;
2324

2425

2526
/**
2627
* <p>Parses pica+ records. The parser only parses single records.
2728
* A string containing multiple records must be split into
2829
* individual records before passing it to {@code PicaDecoder}.</p>
29-
*
30+
*
3031
* <p>The parser is designed to accept any string as valid input and
3132
* to parse pica plain format as well as normalised pica. To
3233
* achieve this, the parser behaves as follows:</p>
33-
*
34+
*
3435
* <ul>
3536
* <li>Fields are separated by record markers (0x1d), field
3637
* markers (0x1e) or field end markers (0x0a).</li>
@@ -49,15 +50,15 @@
4950
* is assumed that field names, subfields, subfield names or
5051
* subfield values can be empty.</li>
5152
* </ul>
52-
*
53+
*
5354
* <p>Please note that the record marker is treated as a field
5455
* delimiter and not as a record delimiter. Records need to be
5556
* separated prior to parsing them.</p>
56-
*
57+
*
5758
* <p>As the behaviour of the parser may result in unnamed fields or
5859
* subfields or fields with no subfields the {@code PicaDecoder}
5960
* automatically filters empty fields and subfields:</p>
60-
*
61+
*
6162
* <ul>
6263
* <li>Subfields without a name are ignored (such fields cannot
6364
* have any value because then the first character of the value
@@ -72,30 +73,30 @@
7273
* <li>Input containing only whitespace (spaces and tabs) is
7374
* completely ignored</li>
7475
* </ul>
75-
*
76+
*
7677
* <p>The {@code PicaDecoder} calls {@code receiver.startEntity} and
7778
* {@code receiver.endEntity} for each parsed field and
7879
* {@code receiver.literal} for each parsed subfield. Spaces in the
7980
* field name are not included in the entity name. The input
8081
* "028A \x1faAndy\x1fdWarhol\x1e" would produce the following
8182
* sequence of calls:</p>
82-
*
83+
*
8384
* <ol>
8485
* <li>receiver.startEntity("028A")</li>
8586
* <li>receiver.literal("a", "Andy")</li>
8687
* <li>receiver.literal("d", "Warhol")</li>
8788
* <li>receiver.endEntity()</li>
8889
* </ol>
89-
*
90+
*
9091
* <p>The content of subfield 003@$0 is used for the record id. If
9192
* {@code ignoreMissingIdn} is false and field 003@$0 is not found
9293
* in the record a {@link MissingIdException} is thrown.</p>
93-
*
94+
*
9495
* <p>The parser assumes that the input is utf-8 encoded. The parser
9596
* does not support other pica encodings.</p>
96-
*
97+
*
9798
* @author Christoph Böhme
98-
*
99+
*
99100
*/
100101
@Description("Parses pica+ records. The parser only parses single records. " +
101102
"A string containing multiple records must be split into " +
@@ -108,49 +109,50 @@ public final class PicaDecoder
108109
private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_MARKER, '0'};
109110

110111
private static final int BUFFER_SIZE = 1024 * 1024;
111-
112+
112113
private final StringBuilder idBuilder = new StringBuilder();
113114
private final PicaParserContext parserContext = new PicaParserContext();
114-
115+
115116
private char[] buffer = new char[BUFFER_SIZE];
116117
private int recordLen;
117-
118+
118119
private boolean ignoreMissingIdn;
119120

120121
public void setIgnoreMissingIdn(final boolean ignoreMissingIdn) {
121122
this.ignoreMissingIdn = ignoreMissingIdn;
122123
}
123-
124+
124125
public boolean getIgnoreMissingIdn() {
125126
return ignoreMissingIdn;
126127
}
127-
128+
128129
public void setNormalizeUTF8(final boolean normalizeUTF8) {
129130
parserContext.setNormalizeUTF8(normalizeUTF8);
130131
}
131-
132+
132133
public boolean getNormalizeUTF8() {
133134
return parserContext.getNormalizeUTF8();
134135
}
135-
136+
136137
public void setSkipEmptyFields(final boolean skipEmptyFields) {
137138
parserContext.setSkipEmptyFields(skipEmptyFields);
138139
}
139-
140+
140141
public boolean getSkipEmptyFields() {
141142
return parserContext.getSkipEmptyFields();
142143
}
143-
144+
144145
@Override
145146
public void process(final String record) {
146147
assert !isClosed();
147-
148-
copyToBuffer(record);
149-
148+
149+
buffer = StringUtil.copyToBuffer(record, buffer);
150+
recordLen = record.length();
151+
150152
if (recordIsEmpty()) {
151153
return;
152154
}
153-
155+
154156
String id = extractRecordId();
155157
if (id == null) {
156158
if (!ignoreMissingIdn) {
@@ -165,28 +167,20 @@ public void process(final String record) {
165167
state = state.parseChar(buffer[i], parserContext);
166168
}
167169
state.endOfInput(parserContext);
168-
170+
169171
getReceiver().endRecord();
170172
}
171-
173+
172174
@Override
173175
protected void onSetReceiver() {
174176
parserContext.setReceiver(getReceiver());
175177
}
176-
178+
177179
@Override
178180
protected void onResetStream() {
179181
parserContext.reset();
180182
}
181-
182-
private void copyToBuffer(final String record) {
183-
recordLen = record.length();
184-
if(recordLen > buffer.length) {
185-
buffer = new char[buffer.length * 2];
186-
}
187-
record.getChars(0, recordLen, buffer, 0);
188-
}
189-
183+
190184
private boolean recordIsEmpty() {
191185
for (int i = 0; i < recordLen; ++i) {
192186
if (buffer[i] != ' ' && buffer[i] != '\t') {
@@ -195,23 +189,23 @@ private boolean recordIsEmpty() {
195189
}
196190
return true;
197191
}
198-
192+
199193
/**
200194
* Searches the record for the sequence specified in {@code ID_FIELD}
201195
* and returns all characters following this sequence until the next
202196
* control character (see {@link PicaConstants}) is found or the end of
203197
* the record is reached. Only the first occurrence of the sequence is
204198
* processed, later occurrences are ignored.
205-
*
199+
*
206200
* If the sequence is not found in the string or if it is not followed
207201
* by any characters then {@code null} is returned.
208-
*
202+
*
209203
* @return value of subfield 003@$0 or null if the
210204
* field is not found or is empty.
211205
*/
212206
private String extractRecordId() {
213207
idBuilder.setLength(0);
214-
208+
215209
int fieldPos = 0;
216210
boolean skip = false;
217211
for (int i = 0; i < recordLen; ++i) {
@@ -238,17 +232,17 @@ private String extractRecordId() {
238232
}
239233
}
240234
}
241-
235+
242236
if (idBuilder.length() > 0) {
243237
return idBuilder.toString();
244238
}
245239
return null;
246240
}
247-
241+
248242
private static boolean isFieldDelimiter(final char ch) {
249243
return ch == PicaConstants.RECORD_MARKER
250244
|| ch == PicaConstants.FIELD_MARKER
251245
|| ch == PicaConstants.FIELD_END_MARKER;
252246
}
253-
247+
254248
}

0 commit comments

Comments
 (0)