Skip to content

Commit 12028c6

Browse files
committed
Added module for grouping Pica multiscript fields.
This commit adds a new stream module which processes streams of Pica+ records with multiscript fields into a new structure which is easier to process with Metamorph. The new module is named `PicaMultiscriptRemodeler` and the corresponding Flux module is called `remodel-pica-multiscript`.
1 parent 048c6ba commit 12028c6

File tree

3 files changed

+497
-0
lines changed

3 files changed

+497
-0
lines changed
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
/*
2+
* Copyright 2014 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.pipe;
17+
18+
import java.util.Map;
19+
import java.util.SortedMap;
20+
import java.util.TreeMap;
21+
22+
import org.culturegraph.mf.framework.DefaultStreamPipe;
23+
import org.culturegraph.mf.framework.StreamReceiver;
24+
import org.culturegraph.mf.framework.annotations.Description;
25+
import org.culturegraph.mf.framework.annotations.In;
26+
import org.culturegraph.mf.framework.annotations.Out;
27+
28+
/**
29+
* Groups multiscript fields in entities.
30+
*
31+
* In Pica records some fields can be repeated if the contain contents with a
32+
* non-latin script. These fields can be recognised by the existence of the
33+
* subfields $U and $T. $U contains the name of the script used for the values
34+
* of the fields. $T contains a number which is used to group fields together
35+
* which contain the same contents in different scripts. All fields having the
36+
* same field name and the same number in $T are considered to belong together.
37+
*
38+
* This module scans the input stream for Pica multiscript fields and remodels
39+
* them by merging all fields belong to the same multiscript group into one
40+
* entity. This entity has the original field name. Within this entity a new
41+
* entity for each of the original fields is created. These entities carry the
42+
* name of the script used in the field.
43+
*
44+
* The following example shows how the input
45+
*
46+
* <pre>
47+
* 021A $T 01 $U Latn $a Title
48+
* 021A $T 01 $U Grek $a Greek title
49+
* 021C $T 01 $U Latn $a Subseries A
50+
* 021C $T 02 $U Latn $a Subseries B
51+
* 021C $T 01 $U Grek $a Greek subseries A
52+
* 021C $T 02 $U Grek $a Greek subseries B
53+
* </pre>
54+
*
55+
* is remodeled into
56+
*
57+
* <pre>
58+
* 021A {
59+
* Latn { T: 01, U: Latn, a: Title }
60+
* Grek { T: 01, U: Grek, a: Greek title }
61+
* }
62+
* 021C {
63+
* Latn { T: 01, U: Latn, a: Subseries A }
64+
* Grek { T: 01, U: Grek, a: Greek subseries A }
65+
* }
66+
* 021C {
67+
* Latn { T: 02, U: Latn, a: Subseries B }
68+
* Grek { T: 02, U: Grek, a: Greek subseries B}
69+
* }
70+
* </pre>
71+
*
72+
* Fields which do not contain subfields $U and $T are passed through the module
73+
* unaffected. If a multiscript field is encountered which only exists in a
74+
* single script it is not remodeled but simply passed through. The module
75+
* assumes that no more than two script-variants of a field exist. If a field
76+
* with more than two variants is encountered then the behaviour of
77+
* {@code PicaMultiscriptRemodeler} is undefined.
78+
*
79+
* The order of the output is determined by the order of the second occurrences
80+
* of the multiscript fields. Multiscript fields without a second occurrences are
81+
* output when the second occurrence of a field with a greater group number is
82+
* encountered.
83+
*
84+
* If a field contains only $U or $T but not both, the field is simply passed
85+
* through.
86+
*
87+
* If the sequence of input events does not follow the Pica record definitions
88+
* (order of fields, nesting of entities) the behaviour of this module is
89+
* undefined.
90+
*
91+
* @author Christoph Böhme
92+
*
93+
*/
94+
@In(StreamReceiver.class)
95+
@Out(StreamReceiver.class)
96+
@Description("Groups multiscript fields in entities")
97+
public final class PicaMultiscriptRemodeler extends
98+
DefaultStreamPipe<StreamReceiver> {
99+
100+
private static final BufferedField BEFORE_FIRST_FIELD = new BufferedField("", null);
101+
102+
private static final Object GROUP_SUBFIELD = "T";
103+
private static final Object SCRIPT_SUBFIELD = "U";
104+
105+
private BufferedField currentField;
106+
private BufferedField lastField;
107+
108+
private final SortedMap<String, BufferedField> bufferedFields = new TreeMap<String, BufferedField>();
109+
110+
@Override
111+
public void startRecord(final String identifier) {
112+
getReceiver().startRecord(identifier);
113+
114+
currentField = null;
115+
lastField = BEFORE_FIRST_FIELD;
116+
117+
bufferedFields.clear();
118+
}
119+
120+
@Override
121+
public void endRecord() {
122+
emitAsSingleMultiscriptFields(bufferedFields);
123+
getReceiver().endRecord();
124+
}
125+
126+
@Override
127+
public void startEntity(final String name) {
128+
currentField = new BufferedField(name);
129+
currentField.stream.setReceiver(getReceiver());
130+
131+
if (!lastField.name.equals(currentField.name)) {
132+
emitAsSingleMultiscriptFields(bufferedFields);
133+
}
134+
}
135+
136+
@Override
137+
public void endEntity() {
138+
if (currentField.group == null || currentField.script == null) {
139+
emitNonMultiscriptField();
140+
} else {
141+
if (bufferedFields.containsKey(currentField.group)) {
142+
emitAsSingleMultiscriptFields(getSingleMultiscriptFieldsBeforeCurrentField());
143+
emitRemodeledMultiscriptField(bufferedFields.remove(currentField.group), currentField);
144+
} else {
145+
bufferMultiscriptField(currentField);
146+
}
147+
}
148+
149+
lastField = currentField;
150+
currentField = null;
151+
}
152+
153+
@Override
154+
public void literal(final String name, final String value) {
155+
currentField.stream.literal(name, value);
156+
157+
if (GROUP_SUBFIELD.equals(name)) {
158+
currentField.group = value;
159+
} else if (SCRIPT_SUBFIELD.equals(name)) {
160+
currentField.script = value;
161+
}
162+
}
163+
164+
private void bufferMultiscriptField(final BufferedField field) {
165+
bufferedFields.put(field.group, field);
166+
}
167+
168+
private Map<?, BufferedField> getSingleMultiscriptFieldsBeforeCurrentField() {
169+
return bufferedFields.headMap(currentField.group);
170+
}
171+
172+
private void emitNonMultiscriptField() {
173+
getReceiver().startEntity(currentField.name);
174+
currentField.stream.replay();
175+
getReceiver().endEntity();
176+
}
177+
178+
private void emitRemodeledMultiscriptField(final BufferedField firstField, final BufferedField secondField) {
179+
getReceiver().startEntity(firstField.name);
180+
181+
getReceiver().startEntity(firstField.script);
182+
firstField.stream.replay();
183+
getReceiver().endEntity();
184+
185+
getReceiver().startEntity(secondField.script);
186+
secondField.stream.replay();
187+
getReceiver().endEntity();
188+
189+
getReceiver().endEntity();
190+
}
191+
192+
private void emitAsSingleMultiscriptFields(final Map<?, BufferedField> fields) {
193+
for (final BufferedField field : fields.values()) {
194+
getReceiver().startEntity(field.name);
195+
field.stream.replay();
196+
getReceiver().endEntity();
197+
}
198+
fields.clear();
199+
}
200+
201+
private static class BufferedField {
202+
203+
public String group;
204+
public String script;
205+
206+
public final String name;
207+
public final StreamBuffer stream;
208+
209+
public BufferedField(final String name) {
210+
this(name, new StreamBuffer());
211+
}
212+
213+
public BufferedField(final String name, final StreamBuffer stream) {
214+
this.group = null;
215+
this.script = null;
216+
this.name = name;
217+
this.stream = stream;
218+
}
219+
220+
}
221+
222+
}

src/main/resources/flux-commands.properties

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,5 @@ filter org.culturegraph.mf.stream.pipe.Filter
106106
add-oreaggregation org.culturegraph.mf.stream.pipe.OreAggregationAdder
107107
digest-file org.culturegraph.mf.stream.pipe.FileDigestCalculator
108108
reorder-triple org.culturegraph.mf.stream.pipe.TripleReorder
109+
110+
remodel-pica-multiscript org.culturegraph.mf.stream.pipe.PicaMultiscriptRemodeler

0 commit comments

Comments
 (0)