Fix #137: Refactored the pica parser to be more lenient.

cboehme · cboehme · commit ae5a08afd9e3 · 2013-10-28T08:37:56.000+01:00
The parser no longer expects fields and subfields to end with a field or
subfield marker. This should allow parsing of pica serialisation
variants which use the markers as delimiters as well as variants which
use the markers as indicators.

The fixUnexpectedEOR option was removed as there is no unexpected end of
record anymore.
diff --git a/src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java b/src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java
@@ -15,7 +15,6 @@
  */
 package org.culturegraph.mf.stream.converter.bib;
 
-import org.culturegraph.mf.exceptions.FormatException;
 import org.culturegraph.mf.framework.DefaultObjectPipe;
 import org.culturegraph.mf.framework.StreamReceiver;
 import org.culturegraph.mf.framework.annotations.Description;
@@ -62,7 +61,6 @@ public final class PicaDecoder
 	private int recordLen;
 	
 	private boolean ignoreMissingIdn;
-	private boolean fixUnexpectedEOR;
 
 	public void setIgnoreMissingIdn(final boolean ignoreMissingIdn) {
 		this.ignoreMissingIdn = ignoreMissingIdn;
@@ -72,14 +70,6 @@ public boolean getIgnoreMissingIdn() {
 		return ignoreMissingIdn;
 	}
 	
-	public void setFixUnexpectedEOR(final boolean fixUnexpectedEOR) {
-		this.fixUnexpectedEOR = fixUnexpectedEOR;
-	}
-	
-	public boolean getFixUnexpectedEOR() {
-		return fixUnexpectedEOR;
-	}
-	
 	public void setNormalizeUTF8(final boolean normalizeUTF8) {
 		parserContext.setNormalizeUTF8(normalizeUTF8);
 	}
@@ -115,18 +105,11 @@ public void process(final String record) {
 		}
 		getReceiver().startRecord(id);
 
-		PicaParserState state = PicaParserState.FIELD_START;
+		PicaParserState state = PicaParserState.FIELD_NAME;
 		for (int i = 0; i < recordLen; ++i) {
 			state = state.parseChar(buffer[i], parserContext);
 		}
-		if (state != PicaParserState.FIELD_START) {
-			if (fixUnexpectedEOR) {
-				state = state.parseChar(PicaConstants.FIELD_DELIMITER, parserContext);
-				assert state == PicaParserState.FIELD_START;
-			} else {
-				throw new FormatException("Unexpected end of record");
-			}
-		}
+		state.endOfInput(parserContext);
 		
 		getReceiver().endRecord();
 	}
diff --git a/src/main/java/org/culturegraph/mf/stream/converter/bib/PicaParserState.java b/src/main/java/org/culturegraph/mf/stream/converter/bib/PicaParserState.java
@@ -19,8 +19,8 @@
 /**
  * A parser for PICA+ records. Only single records can be parsed as the parser
  * does not recognise end-of-record markers (usually new lines). The initial
- * parser state is FIELD_START. A valid state for termination is FIELD_START.
- * The parser processes any input, there is no error state.
+ * parser state is FIELD_NAME. All states are valid end states. The parser
+ * processes any input, there is no error state.
  * 
  * The parser ignores spaces in field names. They are not included in the
  * field name.
@@ -34,23 +34,14 @@
  */
 enum PicaParserState {
 	
-	FIELD_START {
-		@Override
-		protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
-			if (ch == PicaConstants.FIELD_DELIMITER || ch == ' ') {
-				return FIELD_START;
-			}
-			return FIELD_NAME.parseChar(ch, ctx);
-		}
-	},
 	FIELD_NAME {
 		@Override
 		protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
 			final PicaParserState next;
 			if (ch == PicaConstants.FIELD_DELIMITER) {
 				ctx.emitStartEntity();
 				ctx.emitEndEntity();
-				next = FIELD_START;
+				next = FIELD_NAME;
 			} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
 				ctx.emitStartEntity();
 				next = SUBFIELD_NAME;
@@ -62,22 +53,33 @@ protected PicaParserState parseChar(final char ch, final PicaParserContext ctx)
 			}
 			return next;
 		}
+		
+		@Override
+		protected void endOfInput(final PicaParserContext ctx) {
+			ctx.emitStartEntity();
+			ctx.emitEndEntity();
+		}
 	},
 	SUBFIELD_NAME {
 		@Override
 		protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
 			final PicaParserState next;
 			if (ch == PicaConstants.FIELD_DELIMITER) {
 				ctx.emitEndEntity();
-				next = FIELD_START;
+				next = FIELD_NAME;
 			} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
-				next = SUBFIELD_NAME;
+				next = this;
 			} else {
 				ctx.setSubfieldName(ch);
 				next = SUBFIELD_VALUE;
 			}
 			return next;
 		}
+		
+		@Override
+		protected void endOfInput(final PicaParserContext ctx) {
+			ctx.emitEndEntity();
+		}
 	},
 	SUBFIELD_VALUE {
 		@Override
@@ -86,7 +88,7 @@ protected PicaParserState parseChar(final char ch, final PicaParserContext ctx)
 			if (ch == PicaConstants.FIELD_DELIMITER) {
 				ctx.emitLiteral();
 				ctx.emitEndEntity();
-				next = FIELD_START;
+				next = FIELD_NAME;
 			} else if (ch == PicaConstants.SUBFIELD_DELIMITER) {
 				ctx.emitLiteral();
 				next = SUBFIELD_NAME;
@@ -96,8 +98,16 @@ protected PicaParserState parseChar(final char ch, final PicaParserContext ctx)
 			}
 			return next;
 		}
+		
+		@Override
+		protected void endOfInput(final PicaParserContext ctx) {
+			ctx.emitLiteral();
+			ctx.emitEndEntity();
+		}
 	};
 
 	protected abstract PicaParserState parseChar(final char ch, final PicaParserContext ctx);
 	
+	protected abstract void endOfInput(final PicaParserContext ctx);
+	
 }
diff --git a/src/test/java/org/culturegraph/mf/stream/converter/bib/PicaDecoderTest.java b/src/test/java/org/culturegraph/mf/stream/converter/bib/PicaDecoderTest.java