|
1 |
| -/* |
2 |
| - * Copyright 2013 Deutsche Nationalbibliothek |
3 |
| - * |
4 |
| - * Licensed under the Apache License, Version 2.0 the "License"; |
5 |
| - * you may not use this file except in compliance with the License. |
6 |
| - * You may obtain a copy of the License at |
7 |
| - * |
8 |
| - * http://www.apache.org/licenses/LICENSE-2.0 |
9 |
| - * |
10 |
| - * Unless required by applicable law or agreed to in writing, software |
11 |
| - * distributed under the License is distributed on an "AS IS" BASIS, |
12 |
| - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 |
| - * See the License for the specific language governing permissions and |
14 |
| - * limitations under the License. |
15 |
| - */ |
16 |
| -package org.culturegraph.mf.stream.converter.bib; |
17 |
| - |
18 |
| -import java.text.Normalizer; |
19 |
| -import java.text.Normalizer.Form; |
20 |
| - |
21 |
| -import org.culturegraph.mf.framework.DefaultStreamPipe; |
22 |
| - |
23 |
| -import org.culturegraph.mf.framework.DefaultStreamPipe; |
24 |
| -import org.culturegraph.mf.framework.ObjectReceiver; |
25 |
| -import org.culturegraph.mf.framework.StreamReceiver; |
26 |
| -import org.culturegraph.mf.framework.annotations.Description; |
27 |
| -import org.culturegraph.mf.framework.annotations.In; |
28 |
| -import org.culturegraph.mf.framework.annotations.Out; |
29 |
| - |
30 |
| - |
31 |
| -/** |
32 |
| - * Encodes an event stream in pica+ format. |
33 |
| - * |
34 |
| - * @see PicaEncoder |
35 |
| - * |
36 |
| - * @author Markus Michael Geipel, Christoph Böhme, Yining Li |
37 |
| - * |
38 |
| - */ |
39 |
| -@Description("Encodes a stream in pica+ Format") |
40 |
| -@In(StreamReceiver.class) |
41 |
| -@Out(String.class) |
42 |
| -public class PicaEncoder extends DefaultStreamPipe<ObjectReceiver<String>> { |
43 |
| - |
44 |
| - private static final String FIELD_DELIMITER = "\u001e"; |
45 |
| - private static final String SUB_DELIMITER = "\u001f"; |
46 |
| - private static boolean idnControlSubField = false; |
47 |
| - private StringBuilder builder = new StringBuilder(); |
48 |
| - private String idn=""; |
49 |
| - |
50 |
| - /** |
51 |
| - * For each field in the stream the method calls: |
52 |
| - * <ol> |
53 |
| - * <li>receiver.startEntity</li> |
54 |
| - * <li>receiver.literal for each subfield of the field</li> |
55 |
| - * <li>receiver.endEntity</li> |
56 |
| - * </ol> |
57 |
| - * Fields without any subfield will be skipped.<br> |
58 |
| - * <strong>Special handling of subfield 'S':</strong> the code of |
59 |
| - * "control subfields" (subfield name='S') will be appended to the |
60 |
| - * fieldName. E.g.: 041A $Saxx would be mapped to the fieldName 041Aa, |
61 |
| - * and xx will be ignored. A recovery of such field to original is not implemented. |
62 |
| - * So the encoder cannot identify a S-field. The S-field special processing |
63 |
| - * can be turn of if the call of decode with the option: |
64 |
| - * (appendcontrolsubfield="false") |
65 |
| - * which default is set to true. |
66 |
| - * |
67 |
| - * @param record |
68 |
| - */ |
69 |
| - @Override |
70 |
| - public final void startRecord(final String name) { |
71 |
| - // the name is a idn, which should be found in the encoded data under 003@. |
72 |
| - this.idn = name; |
73 |
| - } |
74 |
| - |
75 |
| - public final boolean compareIdFromRecord(final String gndId) { |
76 |
| - if (this.idn.equals(gndId)) { |
77 |
| - idnControlSubField = false; //only test this context. |
78 |
| - return true; |
79 |
| - } |
80 |
| - throw new MissingIdException(gndId); |
81 |
| - } |
82 |
| - |
83 |
| - |
84 |
| - @Override |
85 |
| - public final void startEntity(final String name) { |
86 |
| - // Here begins a field (i.e. "028A ", which is given in the name. |
87 |
| - // It is unknown, whether there are any subfields in the field. |
88 |
| - builder.append(name.trim()+ " "); |
89 |
| - if (name.trim().equals("003@")) { |
90 |
| - //Time to check nid |
91 |
| - idnControlSubField = true; |
92 |
| - }else { |
93 |
| - //No check is necessary. |
94 |
| - idnControlSubField = false; |
95 |
| - } |
96 |
| - } |
97 |
| - |
98 |
| - @Override |
99 |
| - public final void literal(final String name, final String value) { |
100 |
| - // |
101 |
| - final String value_new = Normalizer.normalize(value, Form.NFD); |
102 |
| - if (idnControlSubField == true){ |
103 |
| - // it is a 003@ field, the same nid delivered with record should follow |
104 |
| - if (compareIdFromRecord(value)) idnControlSubField = false; |
105 |
| - } |
106 |
| - builder.append(SUB_DELIMITER); |
107 |
| - builder.append(name); |
108 |
| - builder.append(value_new); |
109 |
| - } |
110 |
| - |
111 |
| - @Override |
112 |
| - public final void endEntity() { |
113 |
| - builder.append(FIELD_DELIMITER); |
114 |
| - } |
115 |
| - |
116 |
| - @Override |
117 |
| - public final void endRecord() { |
118 |
| - getReceiver().process(builder.toString()); |
119 |
| - builder = new StringBuilder(); |
120 |
| - } |
121 |
| - |
122 |
| -} |
| 1 | +/* |
| 2 | + * Copyright 2013 Deutsche Nationalbibliothek |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | +package org.culturegraph.mf.stream.converter.bib; |
| 17 | + |
| 18 | +import java.text.Normalizer; |
| 19 | +import java.text.Normalizer.Form; |
| 20 | +import java.util.regex.Matcher; |
| 21 | +import java.util.regex.Pattern; |
| 22 | + |
| 23 | +import org.culturegraph.mf.exceptions.FormatException; |
| 24 | +import org.culturegraph.mf.framework.DefaultStreamPipe; |
| 25 | + |
| 26 | +import org.culturegraph.mf.framework.DefaultStreamPipe; |
| 27 | +import org.culturegraph.mf.framework.ObjectReceiver; |
| 28 | +import org.culturegraph.mf.framework.StreamReceiver; |
| 29 | +import org.culturegraph.mf.framework.annotations.Description; |
| 30 | +import org.culturegraph.mf.framework.annotations.In; |
| 31 | +import org.culturegraph.mf.framework.annotations.Out; |
| 32 | + |
| 33 | + |
| 34 | +/** |
| 35 | + * Encodes an event stream in pica+ format. |
| 36 | + * |
| 37 | + * <strong>Special handling of subfield 'S':</strong> the code of |
| 38 | + * "control subfields" (subfield name='S') will be appended to the fieldName. |
| 39 | + * E.g.: 041A $Saxx would be mapped to the fieldName 041Aa, and xx will be |
| 40 | + * ignored. A recovery of such field to original is not implemented. So the |
| 41 | + * encoder cannot identify an S-field. |
| 42 | + * The S-field special processing can be turned on if the decoder is called |
| 43 | + * with the option: (appendcontrolsubfield="true") |
| 44 | + * The default value of this option is set to "false". |
| 45 | + * |
| 46 | + * @see PicaDecoder |
| 47 | + * |
| 48 | + * @author Yining Li |
| 49 | + * |
| 50 | + */ |
| 51 | +@Description("Encodes a stream in pica+ Format") |
| 52 | +@In(StreamReceiver.class) |
| 53 | +@Out(String.class) |
| 54 | +public final class PicaEncoder extends DefaultStreamPipe<ObjectReceiver<String>> { |
| 55 | + |
| 56 | + private static final String FIELD_DELIMITER = "\u001e"; |
| 57 | + private static final String SUB_DELIMITER = "\u001f"; |
| 58 | + private boolean idnControlSubField; |
| 59 | + private boolean recordOpen; |
| 60 | + private boolean entityOpen; |
| 61 | + private StringBuilder builder = new StringBuilder(); |
| 62 | + private String id=""; |
| 63 | + |
| 64 | + private static final String FIELD_NAME_PATTERN_STRING = "\\d{3}.(/..)?"; |
| 65 | + private static final Pattern FIELD_NAME_PATTERN = Pattern.compile(FIELD_NAME_PATTERN_STRING); |
| 66 | + private boolean ignoreRecordId; |
| 67 | + |
| 68 | + /** |
| 69 | + * For each field in the stream the method calls: |
| 70 | + * <ol> |
| 71 | + * <li>receiver.startEntity</li> |
| 72 | + * <li>receiver.literal for each subfield of the field</li> |
| 73 | + * <li>receiver.endEntity</li> |
| 74 | + * </ol> |
| 75 | + * Fields without any subfield will be skipped.<br> |
| 76 | + * |
| 77 | + * @param recordId |
| 78 | + */ |
| 79 | + @Override |
| 80 | + public void startRecord(final String recordId) { |
| 81 | + // recordId is an idn, which should also be found in the encoded data under 003@. |
| 82 | + //Any remainder of the previous record is cleared before the new one begins. |
| 83 | + builder.setLength(0); |
| 84 | + this.id = recordId; |
| 85 | + //Now an entity can be opened. But no literal is allowed. |
| 86 | + this.recordOpen = true; |
| 87 | + this.entityOpen = false; |
| 88 | + } |
| 89 | + |
| 90 | + public void setIgnoreRecordId(final boolean ignoreRecordId) { |
| 91 | + this.ignoreRecordId = ignoreRecordId; |
| 92 | + } |
| 93 | + |
| 94 | + public boolean getIgnoreRecordId() { |
| 95 | + return this.ignoreRecordId; |
| 96 | + } |
| 97 | + |
| 98 | + protected void compareIdFromRecord(final String recordId) { |
| 99 | + if (this.id.equals(recordId)) { |
| 100 | + idnControlSubField = false; //only test this context. |
| 101 | + return; |
| 102 | + } |
| 103 | + throw new MissingIdException(recordId); |
| 104 | + } |
| 105 | + |
| 106 | + |
| 107 | + @Override |
| 108 | + public void startEntity(final String name) { |
| 109 | + // Here begins a field (e.g. "028A "), which is given in the name. |
| 110 | + // It is unknown whether there are any subfields in the field. |
| 111 | + final Matcher fieldNameMatcher = FIELD_NAME_PATTERN.matcher(name); |
| 112 | + if (fieldNameMatcher.find()) { |
| 113 | + builder.append(name.trim()+ " "); |
| 114 | + } |
| 115 | + else { |
| 116 | + throw new FormatException(name); |
| 117 | + } |
| 118 | + if (name.trim().equals("003@") && !getIgnoreRecordId()) { |
| 119 | + //Time to check record Id in the following subfield. |
| 120 | + idnControlSubField = true; |
| 121 | + }else { |
| 122 | + //No check is necessary. |
| 123 | + idnControlSubField = false; |
| 124 | + } |
| 125 | + //Now literals can be opened. But no entities are allowed. |
| 126 | + if (recordOpen) |
| 127 | + this.entityOpen = true; |
| 128 | + } |
| 129 | + |
| 130 | + @Override |
| 131 | + public void literal(final String name, final String value) { |
| 132 | + //A subfield name consists of exactly one character or digit. |
| 133 | + if (name.length()!=1){ |
| 134 | + throw new FormatException(name); |
| 135 | + } else if (!entityOpen){ |
| 136 | + throw new FormatException(name); //TODO: define a dedicated exception for this case |
| 137 | + } |
| 138 | + final String valueNew = Normalizer.normalize(value, Form.NFD); |
| 139 | + if (idnControlSubField){ |
| 140 | + // it is a 003@ field, the same record id delivered with record should follow |
| 141 | + compareIdFromRecord(value); |
| 142 | + } |
| 143 | + builder.append(SUB_DELIMITER); |
| 144 | + builder.append(name); |
| 145 | + builder.append(valueNew); |
| 146 | +} |
| 147 | + |
| 148 | + @Override |
| 149 | + public void endEntity() { |
| 150 | + builder.append(FIELD_DELIMITER); |
| 151 | + //Now an entity can be opened. But no literal is allowed. |
| 152 | + this.entityOpen = false; |
| 153 | + } |
| 154 | + |
| 155 | + @Override |
| 156 | + public void endRecord() { |
| 157 | + getReceiver().process(builder.toString()); |
| 158 | + builder.setLength(0); |
| 159 | + //Now a record can be opened. But no literals or entities are allowed. |
| 160 | + this.recordOpen = false; |
| 161 | + this.entityOpen = false; |
| 162 | + } |
| 163 | + @Override |
| 164 | + protected void onResetStream() { |
| 165 | + builder.setLength(0); |
| 166 | + } |
| 167 | + |
| 168 | +} |
0 commit comments