Skip to content

Commit 4ce0b4c

Browse files
committed
Simplified output of PicaMultiscriptRemodeler
The original version of `PicaMultiscriptRemodeler` used the original script names for the intermediate layer of entities. This makes processing of the output unnecessarily complex because of the large number of possible entity names. As currently only three types of scripts need to be distinguished, the module was changed to map the script names to one of the three script types which are then used as entity names.
1 parent 914b58d commit 4ce0b4c

File tree

2 files changed

+77
-12
lines changed

2 files changed

+77
-12
lines changed

src/main/java/org/culturegraph/mf/stream/pipe/PicaMultiscriptRemodeler.java

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,13 @@
3838
* This module scans the input stream for Pica multiscript fields and remodels
3939
* them by merging all fields belonging to the same multiscript group into one
4040
* entity. This entity has the original field name. Within this entity a new
41-
* entity for each of the original fields is created. These entities carry the
42-
* name of the script used in the field.
41+
* entity for each of the original fields is created. These entities are named
42+
* depending on the type of script used. Three scripts are distinguished:
43+
* <ul>
44+
* <li>Latin</li>
45+
* <li>NonLatinLR</li>
46+
* <li>NonLatinRL</li>
47+
* </ul>
4348
*
4449
* The following example shows how the input
4550
*
@@ -56,16 +61,16 @@
5661
*
5762
* <pre>
5863
* 021A {
59-
* Latn { T: 01, U: Latn, a: Title }
60-
* Grek { T: 01, U: Grek, a: Greek title }
64+
* Latin { T: 01, U: Latn, a: Title }
65+
* NonLatinLR { T: 01, U: Grek, a: Greek title }
6166
* }
6267
* 021C {
63-
* Latn { T: 01, U: Latn, a: Subseries A }
64-
* Grek { T: 01, U: Grek, a: Greek subseries A }
68+
* Latin { T: 01, U: Latn, a: Subseries A }
69+
* NonLatinLR { T: 01, U: Grek, a: Greek subseries A }
6570
* }
6671
* 021C {
67-
* Latn { T: 02, U: Latn, a: Subseries B }
68-
* Grek { T: 02, U: Grek, a: Greek subseries B}
72+
* Latin { T: 02, U: Latn, a: Subseries B }
73+
* NonLatinLR { T: 02, U: Grek, a: Greek subseries B }
6974
* }
7075
* </pre>
7176
*
@@ -97,11 +102,19 @@
97102
public final class PicaMultiscriptRemodeler extends
98103
DefaultStreamPipe<StreamReceiver> {
99104

105+
public static final String ENTITY_NAME_FOR_LATIN = "Latin";
106+
public static final String ENTITY_NAME_FOR_NON_LATIN_LR = "NonLatinLR";
107+
public static final String ENTITY_NAME_FOR_NON_LATIN_RL = "NonLatinRL";
108+
100109
private static final BufferedField BEFORE_FIRST_FIELD = new BufferedField("", null);
101110

102111
private static final String GROUP_SUBFIELD = "T";
103112
private static final String SCRIPT_SUBFIELD = "U";
104113

114+
private static final String LATIN_SCRIPT = "Latn";
115+
private static final String ARABIC_SCRIPT = "Arab";
116+
private static final String HEBREW_SCRIPT = "Hebr";
117+
105118
private BufferedField currentField;
106119
private BufferedField lastField;
107120

@@ -178,11 +191,11 @@ private void emitNonMultiscriptField() {
178191
private void emitRemodeledMultiscriptField(final BufferedField firstField, final BufferedField secondField) {
179192
getReceiver().startEntity(firstField.name);
180193

181-
getReceiver().startEntity(firstField.script);
194+
getReceiver().startEntity(mapScriptToEntityName(firstField.script));
182195
firstField.stream.replay();
183196
getReceiver().endEntity();
184197

185-
getReceiver().startEntity(secondField.script);
198+
getReceiver().startEntity(mapScriptToEntityName(secondField.script));
186199
secondField.stream.replay();
187200
getReceiver().endEntity();
188201

@@ -198,6 +211,16 @@ private void emitAsSingleMultiscriptFields(final Map<?, BufferedField> fields) {
198211
fields.clear();
199212
}
200213

214+
private String mapScriptToEntityName(final String script) {
215+
if (LATIN_SCRIPT.equals(script)) {
216+
return ENTITY_NAME_FOR_LATIN;
217+
} else if (ARABIC_SCRIPT.equals(script)
218+
|| HEBREW_SCRIPT.equals(script)) {
219+
return ENTITY_NAME_FOR_NON_LATIN_RL;
220+
}
221+
return ENTITY_NAME_FOR_NON_LATIN_LR;
222+
}
223+
201224
private static class BufferedField {
202225

203226
public String group;

src/test/java/org/culturegraph/mf/stream/pipe/PicaMultiscriptRemodelerTest.java

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,16 @@ public class PicaMultiscriptRemodelerTest {
4141
private static final String FIELD_021C = "021C";
4242
private static final String SCRIPT_LATIN = "Latn";
4343
private static final String SCRIPT_GREEK = "Grek";
44+
private static final String SCRIPT_ARABIC = "Arab";
45+
private static final String SCRIPT_HEBREW = "Hebr";
4446

4547
private static final String VALUE_1 = "Subfield 1";
4648
private static final String VALUE_2 = "Subfield 2";
4749
private static final String VALUE_3 = "Subfield 3";
4850
private static final String VALUE_1_GREEK = "ĸµ 1";
4951
private static final String VALUE_2_GREEK = "ĸµ 2";
52+
private static final String VALUE_1_ARABIC = "Subfield/Arabic 1";
53+
private static final String VALUE_1_HEBREW = "Subfield/Hebrew 1";
5054

5155
private PicaMultiscriptRemodeler remodeler;
5256

@@ -215,6 +219,34 @@ public void shouldPassThroughIncompleteMultiscriptFields() {
215219
ordered.verify(receiver).endRecord();
216220
}
217221

222+
@Test
223+
public void shouldLabelArabicAsNonLatinRightToLeftScript() {
224+
remodeler.startRecord(RECORD_ID);
225+
emitMultscriptField(FIELD_021A, "01", SCRIPT_LATIN, VALUE_1);
226+
emitMultscriptField(FIELD_021A, "01", SCRIPT_ARABIC, VALUE_1_ARABIC);
227+
remodeler.endRecord();
228+
229+
final InOrder ordered = inOrder(receiver);
230+
ordered.verify(receiver).startRecord(RECORD_ID);
231+
verifyMultiscriptField(ordered, FIELD_021A, "01", SCRIPT_LATIN,
232+
VALUE_1, SCRIPT_ARABIC, VALUE_1_ARABIC);
233+
ordered.verify(receiver).endRecord();
234+
}
235+
236+
@Test
237+
public void shouldLabelHebrewAsNonLatinRightToLeftScript() {
238+
remodeler.startRecord(RECORD_ID);
239+
emitMultscriptField(FIELD_021A, "01", SCRIPT_LATIN, VALUE_1);
240+
emitMultscriptField(FIELD_021A, "01", SCRIPT_HEBREW, VALUE_1_HEBREW);
241+
remodeler.endRecord();
242+
243+
final InOrder ordered = inOrder(receiver);
244+
ordered.verify(receiver).startRecord(RECORD_ID);
245+
verifyMultiscriptField(ordered, FIELD_021A, "01", SCRIPT_LATIN,
246+
VALUE_1, SCRIPT_HEBREW, VALUE_1_HEBREW);
247+
ordered.verify(receiver).endRecord();
248+
}
249+
218250
@Test
219251
public void shouldClearStateOnResetStream() {
220252
remodeler.startRecord(RECORD_ID);
@@ -247,12 +279,12 @@ private void verifyMultiscriptField(final InOrder ordered,
247279
final String value1, final String script2, final String value2) {
248280

249281
ordered.verify(receiver).startEntity(field);
250-
ordered.verify(receiver).startEntity(script1);
282+
ordered.verify(receiver).startEntity(mapScriptToEntityName(script1));
251283
ordered.verify(receiver).literal("T", groupNumber);
252284
ordered.verify(receiver).literal("U", script1);
253285
ordered.verify(receiver).literal("a", value1);
254286
ordered.verify(receiver).endEntity();
255-
ordered.verify(receiver).startEntity(script2);
287+
ordered.verify(receiver).startEntity(mapScriptToEntityName(script2));
256288
ordered.verify(receiver).literal("T", groupNumber);
257289
ordered.verify(receiver).literal("U", script2);
258290
ordered.verify(receiver).literal("a", value2);
@@ -270,4 +302,14 @@ private void verifySingleMultiscriptField(final InOrder ordered,
270302
ordered.verify(receiver).endEntity();
271303
}
272304

305+
private String mapScriptToEntityName(final String script) {
306+
if (SCRIPT_LATIN.equals(script)) {
307+
return PicaMultiscriptRemodeler.ENTITY_NAME_FOR_LATIN;
308+
} else if (SCRIPT_ARABIC.equals(script)
309+
|| SCRIPT_HEBREW.equals(script)) {
310+
return PicaMultiscriptRemodeler.ENTITY_NAME_FOR_NON_LATIN_RL;
311+
}
312+
return PicaMultiscriptRemodeler.ENTITY_NAME_FOR_NON_LATIN_LR;
313+
}
314+
273315
}

0 commit comments

Comments
 (0)