|
1 | 1 | /*
|
2 |
| - * Copyright 2013 Deutsche Nationalbibliothek |
| 2 | + * Copyright 2013 Christoph Böhme |
3 | 3 | *
|
4 | 4 | * Licensed under the Apache License, Version 2.0 (the "License");
|
5 | 5 | * you may not use this file except in compliance with the License.
|
|
15 | 15 | */
|
16 | 16 | package org.culturegraph.mf.stream.converter.bib;
|
17 | 17 |
|
18 |
| -import java.text.Normalizer; |
19 |
| -import java.text.Normalizer.Form; |
20 |
| -import java.util.regex.Matcher; |
21 |
| -import java.util.regex.Pattern; |
22 |
| - |
23 | 18 | import org.culturegraph.mf.exceptions.FormatException;
|
24 | 19 | import org.culturegraph.mf.framework.DefaultObjectPipe;
|
25 | 20 | import org.culturegraph.mf.framework.StreamReceiver;
|
|
29 | 24 |
|
30 | 25 |
|
31 | 26 | /**
|
32 |
| - * Parses a raw Picaplus stream (utf8 encoding assumed). Events are handled by a |
33 |
| - * {@link StreamReceiver}. |
| 27 | + * Parses a PICA+ record with UTF8 encoding assumed. |
| 28 | + * |
| 29 | + * For each field in the stream the module calls: |
| 30 | + * <ol> |
| 31 | + * <li>receiver.startEntity</li> |
| 32 | + * <li>receiver.literal for each subfield of the field</li> |
| 33 | + * <li>receiver.endEntity</li> |
| 34 | + * </ol> |
| 35 | + * |
| 36 | + * Spaces in the field name are not included in the entity name. |
| 37 | + * |
| 38 | + * Empty subfields are skipped. For instance, processing the following input |
| 39 | + * would NOT produce an empty literal: 003@ \u001f\u001e. The parser also |
| 40 | + * skips unnamed fields without any subfields. |
34 | 41 | *
|
35 |
| - * @see StreamReceiver |
| 42 | + * If {@code ignoreMissingIdn} is false and field 003@$0 is not found in the |
| 43 | + * record a {@link MissingIdException} is thrown. |
36 | 44 | *
|
37 |
| - * @author Markus Michael Geipel, Christoph Böhme |
| 45 | + * @author Christoph Böhme |
38 | 46 | *
|
39 | 47 | */
|
40 |
| -@Description("Parses a raw Picaplus stream (utf8 encoding assumed).") |
| 48 | +@Description("Parses a PICA+ record with UTF8 encoding assumed.") |
41 | 49 | @In(String.class)
|
42 | 50 | @Out(StreamReceiver.class)
|
43 |
| -public final class PicaDecoder |
| 51 | +public final class PicaDecoder |
44 | 52 | extends DefaultObjectPipe<String, StreamReceiver> {
|
45 | 53 |
|
46 |
| - private static final String FIELD_DELIMITER = "\u001e"; |
47 |
| - private static final String SUB_DELIMITER = "\u001f"; |
48 |
| - private static final Pattern FIELD_PATTERN = Pattern.compile( |
49 |
| - FIELD_DELIMITER, Pattern.LITERAL); |
50 |
| - private static final Pattern SUBFIELD_PATTERN = Pattern.compile( |
51 |
| - SUB_DELIMITER, Pattern.LITERAL); |
52 |
| - private static final String ID_PATTERN_STRING = FIELD_DELIMITER + "003@ " |
53 |
| - + SUB_DELIMITER + "0(.*?)" + FIELD_DELIMITER; |
54 |
| - private static final Pattern ID_PATTERN = Pattern |
55 |
| - .compile(ID_PATTERN_STRING); |
56 |
| - private static boolean appendControlSubField = true; |
| 54 | + private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_DELIMITER, '0'}; |
| 55 | + |
| 56 | + private static final int BUFFER_SIZE = 1024 * 1024; |
| 57 | + |
| 58 | + private final StringBuilder idBuilder = new StringBuilder(); |
| 59 | + private final PicaParserContext parserContext = new PicaParserContext(); |
| 60 | + |
| 61 | + private char[] buffer = new char[BUFFER_SIZE]; |
| 62 | + private int recordLen; |
| 63 | + |
| 64 | + private boolean ignoreMissingIdn; |
| 65 | + private boolean fixUnexpectedEOR; |
57 | 66 |
|
58 |
| - /** |
59 |
| - * For each field in the stream the method calls: |
60 |
| - * <ol> |
61 |
| - * <li>receiver.startEntity</li> |
62 |
| - * <li>receiver.literal for each subfield of the field</li> |
63 |
| - * <li>receiver.endEntity</li> |
64 |
| - * </ol> |
65 |
| - * Fields without any subfield will be skipped.<br> |
66 |
| - * <strong>Special handling of subfield 'S':</strong> the code of |
67 |
| - * "control subfields" (subfield name='S') will be appended to the |
68 |
| - * fieldName. E.g.: 041A $Sa would be mapped to the fieldName 041Aa |
69 |
| - * |
70 |
| - * @param record |
71 |
| - */ |
| 67 | + public void setIgnoreMissingIdn(final boolean ignoreMissingIdn) { |
| 68 | + this.ignoreMissingIdn = ignoreMissingIdn; |
| 69 | + } |
| 70 | + |
| 71 | + public boolean getIgnoreMissingIdn() { |
| 72 | + return ignoreMissingIdn; |
| 73 | + } |
| 74 | + |
| 75 | + public void setFixUnexpectedEOR(final boolean fixUnexpectedEOR) { |
| 76 | + this.fixUnexpectedEOR = fixUnexpectedEOR; |
| 77 | + } |
| 78 | + |
| 79 | + public boolean getFixUnexpectedEOR() { |
| 80 | + return fixUnexpectedEOR; |
| 81 | + } |
| 82 | + |
| 83 | + public void setNormalizeUTF8(final boolean normalizeUTF8) { |
| 84 | + parserContext.setNormalizeUTF8(normalizeUTF8); |
| 85 | + } |
| 86 | + |
| 87 | + public boolean getNormalizeUTF8() { |
| 88 | + return parserContext.getNormalizeUTF8(); |
| 89 | + } |
| 90 | + |
| 91 | + public void setSkipEmptyFields(final boolean skipEmptyFields) { |
| 92 | + parserContext.setSkipEmptyFields(skipEmptyFields); |
| 93 | + } |
| 94 | + |
| 95 | + public boolean getSkipEmptyFields() { |
| 96 | + return parserContext.getSkipEmptyFields(); |
| 97 | + } |
| 98 | + |
72 | 99 | @Override
|
73 | 100 | public void process(final String record) {
|
74 | 101 | assert !isClosed();
|
75 |
| - process(record, getReceiver()); |
76 |
| - } |
| 102 | + |
| 103 | + copyToBuffer(record); |
| 104 | + |
| 105 | + if (recordIsEmpty()) { |
| 106 | + return; |
| 107 | + } |
| 108 | + |
| 109 | + String id = extractRecordId(); |
| 110 | + if (id == null) { |
| 111 | + if (!ignoreMissingIdn) { |
| 112 | + throw new MissingIdException("Record has no id"); |
| 113 | + } |
| 114 | + id = ""; |
| 115 | + } |
| 116 | + getReceiver().startRecord(id); |
77 | 117 |
|
78 |
| - public static void setAppendControlSubField(final boolean appendControlSubField) { |
79 |
| - PicaDecoder.appendControlSubField = appendControlSubField; |
| 118 | + PicaParserState state = PicaParserState.FIELD_START; |
| 119 | + for (int i = 0; i < recordLen; ++i) { |
| 120 | + state = state.parseChar(buffer[i], parserContext); |
| 121 | + } |
| 122 | + if (state != PicaParserState.FIELD_START) { |
| 123 | + if (fixUnexpectedEOR) { |
| 124 | + state = state.parseChar(PicaConstants.FIELD_DELIMITER, parserContext); |
| 125 | + assert state == PicaParserState.FIELD_START; |
| 126 | + } else { |
| 127 | + throw new FormatException("Unexpected end of record"); |
| 128 | + } |
| 129 | + } |
| 130 | + |
| 131 | + getReceiver().endRecord(); |
| 132 | + } |
| 133 | + |
| 134 | + @Override |
| 135 | + protected void onSetReceiver() { |
| 136 | + parserContext.setReceiver(getReceiver()); |
| 137 | + } |
| 138 | + |
| 139 | + @Override |
| 140 | + protected void onResetStream() { |
| 141 | + parserContext.reset(); |
80 | 142 | }
|
81 | 143 |
|
82 |
| - public static String extractIdFromRecord(final String record) { |
83 |
| - final Matcher idMatcher = ID_PATTERN.matcher(record); |
84 |
| - if (idMatcher.find()) { |
85 |
| - return idMatcher.group(1); |
| 144 | + private void copyToBuffer(final String record) { |
| 145 | + recordLen = record.length(); |
| 146 | + if(recordLen > buffer.length) { |
| 147 | + buffer = new char[buffer.length * 2]; |
86 | 148 | }
|
87 |
| - throw new MissingIdException(record); |
| 149 | + record.getChars(0, recordLen, buffer, 0); |
88 | 150 | }
|
89 | 151 |
|
90 |
| - public static void process(final String rawRecord, final StreamReceiver receiver) { |
91 |
| - if (rawRecord.trim().isEmpty()) { |
92 |
| - return; |
| 152 | + private boolean recordIsEmpty() { |
| 153 | + for (int i = 0; i < recordLen; ++i) { |
| 154 | + if (buffer[i] != ' ' && buffer[i] != '\t') { |
| 155 | + return false; |
| 156 | + } |
93 | 157 | }
|
| 158 | + return true; |
| 159 | + } |
| 160 | + |
| 161 | + private String extractRecordId() { |
| 162 | + idBuilder.setLength(0); |
94 | 163 |
|
95 |
| - final String record = Normalizer.normalize(rawRecord, Form.NFC); |
96 |
| - try { |
97 |
| - receiver.startRecord(extractIdFromRecord(record)); |
98 |
| - |
99 |
| - for (String field : FIELD_PATTERN.split(record)) { |
100 |
| - final String[] subfields = SUBFIELD_PATTERN.split(field); |
101 |
| - if (subfields.length > 1) { |
102 |
| - final String fieldName; |
103 |
| - final int firstSubfield; |
104 |
| - if (subfields[1].charAt(0) == 'S' && appendControlSubField ) { |
105 |
| - fieldName = subfields[0].trim() + subfields[1].charAt(1); |
106 |
| - firstSubfield = 2; |
| 164 | + int fieldPos = 0; |
| 165 | + boolean skip = false; |
| 166 | + for (int i = 0; i < recordLen; ++i) { |
| 167 | + if (buffer[i] == PicaConstants.FIELD_DELIMITER) { |
| 168 | + if (idBuilder.length() > 0) { |
| 169 | + return idBuilder.toString(); |
| 170 | + } |
| 171 | + fieldPos = 0; |
| 172 | + skip = false; |
| 173 | + continue; |
| 174 | + } |
| 175 | + if (!skip) { |
| 176 | + if (fieldPos < ID_FIELD.length) { |
| 177 | + if (buffer[i] == ID_FIELD[fieldPos]) { |
| 178 | + fieldPos += 1; |
107 | 179 | } else {
|
108 |
| - fieldName = subfields[0].trim(); |
109 |
| - firstSubfield = 1; |
| 180 | + skip = true; |
110 | 181 | }
|
111 |
| - |
112 |
| - receiver.startEntity(fieldName); |
113 |
| - |
114 |
| - for (int i = firstSubfield; i < subfields.length; ++i) { |
115 |
| - final String subfield = subfields[i]; |
116 |
| - receiver.literal(subfield.substring(0, 1), |
117 |
| - subfield.substring(1)); |
| 182 | + } else { |
| 183 | + if (buffer[i] == PicaConstants.SUBFIELD_DELIMITER) { |
| 184 | + skip = true; |
| 185 | + } else { |
| 186 | + idBuilder.append(buffer[i]); |
118 | 187 | }
|
119 |
| - receiver.endEntity(); |
120 | 188 | }
|
121 | 189 | }
|
122 |
| - |
123 |
| - receiver.endRecord(); |
124 |
| - } catch (IndexOutOfBoundsException e) { |
125 |
| - throw new FormatException(e); |
126 |
| - } |
| 190 | + } |
| 191 | + |
| 192 | + return null; |
127 | 193 | }
|
128 | 194 |
|
129 | 195 | }
|
0 commit comments