Updated PicaDecoder to use StringUtil.copyToBuffer.

cboehme · cboehme · commit 5c8002e34dd1 · 2014-02-05T13:10:45.000+01:00
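The original copyToBuffer implementation in PicaDecoder had a bug (see issue #161): when a record did not fit into the buffer, the buffer was only doubled once, which is still too small for records longer than twice the old buffer size. Refactoring the code to use the new StringUtil.copyToBuffer method fixes this.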
diff --git a/src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java b/src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java
@@ -20,17 +20,18 @@
 import org.culturegraph.mf.framework.annotations.Description;
 import org.culturegraph.mf.framework.annotations.In;
 import org.culturegraph.mf.framework.annotations.Out;
+import org.culturegraph.mf.util.StringUtil;
 
 
 /**
  * <p>Parses pica+ records. The parser only parses single records.
  * A string containing multiple records must be split into
  * individual records before passing it to {@code PicaDecoder}.</p>
- * 
+ *
  * <p>The parser is designed to accept any string as valid input and
  * to parse pica plain format as well as normalised pica. To
  * achieve this, the parser behaves as follows:</p>
- * 
+ *
  * <ul>
  * <li>Fields are separated by record markers (0x1d), field
  * markers (0x1e) or field end markers (0x0a).</li>
@@ -49,15 +50,15 @@
  * is assumed that field names, subfields, subfield names or
  * subfield values can be empty.</li>
  * </ul>
- * 
+ *
  * <p>Please note that the record marker is treated as a field
  * delimiter and not as a record delimiter. Records need to be
  * separated prior to parsing them.</p>
- * 
+ *
  * <p>As the behaviour of the parser may result in unnamed fields or
  * subfields or fields with no subfields the {@code PicaDecoder}
  * automatically filters empty fields and subfields:</p>
- * 
+ *
  * <ul>
  * <li>Subfields without a name are ignored (such fields cannot
  * have any value because then the first character of the value
@@ -72,30 +73,30 @@
  * <li>Input containing only whitespace (spaces and tabs) is
  * completely ignored</li>
  * </ul>
- * 
+ *
  * <p>The {@code PicaDecoder} calls {@code receiver.startEntity} and
  * {@code receiver.endEntity} for each parsed field and
  * {@code receiver.literal} for each parsed subfield. Spaces in the
  * field name are not included in the entity name. The input
  * "028A \x1faAndy\x1fdWarhol\x1e" would produce the following
  * sequence of calls:</p>
- * 
+ *
  * <ol>
  * <li>receiver.startEntity("028A")</li>
  * <li>receiver.literal("a", "Andy")</li>
  * <li>receiver.literal("d", "Warhol")</li>
  * <li>receiver.endEntity()</li>
  * </ol>
- * 
+ *
  * <p>The content of subfield 003@$0 is used for the record id. If
  * {@code ignoreMissingIdn} is false and field 003@$0 is not found
  * in the record a {@link MissingIdException} is thrown.</p>
- * 
+ *
  * <p>The parser assumes that the input is utf-8 encoded. The parser
  * does not support other pica encodings.</p>
- * 
+ *
  * @author Christoph Böhme
- * 
+ *
  */
 @Description("Parses pica+ records. The parser only parses single records. " +
 		"A string containing multiple records must be split into " +
@@ -108,49 +109,50 @@ public final class PicaDecoder
 	private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_MARKER, '0'};
 
 	private static final int BUFFER_SIZE = 1024 * 1024;
-	
+
 	private final StringBuilder idBuilder = new StringBuilder();
 	private final PicaParserContext parserContext = new PicaParserContext();
-	
+
 	private char[] buffer = new char[BUFFER_SIZE];
 	private int recordLen;
-	
+
 	private boolean ignoreMissingIdn;
 
 	public void setIgnoreMissingIdn(final boolean ignoreMissingIdn) {
 		this.ignoreMissingIdn = ignoreMissingIdn;
 	}
-	
+
 	public boolean getIgnoreMissingIdn() {
 		return ignoreMissingIdn;
 	}
-	
+
 	public void setNormalizeUTF8(final boolean normalizeUTF8) {
 		parserContext.setNormalizeUTF8(normalizeUTF8);
 	}
-	
+
 	public boolean getNormalizeUTF8() {
 		return parserContext.getNormalizeUTF8();
 	}
-	
+
 	public void setSkipEmptyFields(final boolean skipEmptyFields) {
 		parserContext.setSkipEmptyFields(skipEmptyFields);
 	}
-	
+
 	public boolean getSkipEmptyFields() {
 		return parserContext.getSkipEmptyFields();
 	}
-	
+
 	@Override
 	public void process(final String record) {
 		assert !isClosed();
-		
-		copyToBuffer(record);
-		
+
+		buffer = StringUtil.copyToBuffer(record, buffer);
+		recordLen = record.length();
+
 		if (recordIsEmpty()) {
 			return;
 		}
-		
+
 		String id = extractRecordId();
 		if (id == null) {
 			if (!ignoreMissingIdn) {
@@ -165,28 +167,20 @@ public void process(final String record) {
 			state = state.parseChar(buffer[i], parserContext);
 		}
 		state.endOfInput(parserContext);
-		
+
 		getReceiver().endRecord();
 	}
-	
+
 	@Override
 	protected void onSetReceiver() {
 		parserContext.setReceiver(getReceiver());
 	}
-	
+
 	@Override
 	protected void onResetStream() {
 		parserContext.reset();
 	}
-	
-	private void copyToBuffer(final String record) {
-		recordLen = record.length();
-		if(recordLen > buffer.length) {
-			buffer = new char[buffer.length * 2];
-		}
-		record.getChars(0, recordLen, buffer, 0);
-	}
-	
+
 	private boolean recordIsEmpty() {
 		for (int i = 0; i < recordLen; ++i) {
 			if (buffer[i] != ' ' && buffer[i] != '\t') {
@@ -195,23 +189,23 @@ private boolean recordIsEmpty() {
 		}
 		return true;
 	}
-	
+
 	/**
 	 * Searches the record for the sequence specified in {@code ID_FIELD}
 	 * and returns all characters following this sequence until the next
 	 * control character (see {@link PicaConstants}) is found or the end of
 	 * the record is reached. Only the first occurrence of the sequence is
 	 * processed, later occurrences are ignored.
-	 * 
+	 *
 	 * If the sequence is not found in the string or if it is not followed
 	 * by any characters then {@code null} is returned.
-	 * 
+	 *
 	 * @return value of subfield 003@$0 or null if the
 	 *         field is not found or is empty.
 	 */
 	private String extractRecordId() {
 		idBuilder.setLength(0);
-		
+
 		int fieldPos = 0;
 		boolean skip = false;
 		for (int i = 0; i < recordLen; ++i) {
@@ -238,17 +232,17 @@ private String extractRecordId() {
 				}
 			}
 		}
-		
+
 		if (idBuilder.length() > 0) {
 			return idBuilder.toString();
 		}
 		return null;
 	}
-	
+
 	private static boolean isFieldDelimiter(final char ch) {
 		return ch == PicaConstants.RECORD_MARKER
 				|| ch == PicaConstants.FIELD_MARKER
 				|| ch == PicaConstants.FIELD_END_MARKER;
 	}
-	
+
 }