Skip to content

Commit 694a7c8

Browse files
committed
Merge branch 'adding-pica-encoder' of http://github.com/liyining/metafacture-core into liyining-adding-pica-encoder
2 parents 6036bb5 + 3445e5e commit 694a7c8

File tree

4 files changed

+381
-42
lines changed

4 files changed

+381
-42
lines changed

.classpath

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,42 @@
1-
<?xml version="1.0" encoding="UTF-8"?>
2-
<classpath>
3-
<classpathentry kind="src" output="target/classes" path="src/main/java">
4-
<attributes>
5-
<attribute name="optional" value="true"/>
6-
<attribute name="maven.pomderived" value="true"/>
7-
</attributes>
8-
</classpathentry>
9-
<classpathentry kind="src" output="target/classes" path="src/main/antlr3">
10-
<attributes>
11-
<attribute name="optional" value="true"/>
12-
<attribute name="maven.pomderived" value="true"/>
13-
</attributes>
14-
</classpathentry>
15-
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
16-
<attributes>
17-
<attribute name="maven.pomderived" value="true"/>
18-
</attributes>
19-
</classpathentry>
20-
<classpathentry including="**/*.java" kind="src" output="target/test-classes" path="src/test/java">
21-
<attributes>
22-
<attribute name="optional" value="true"/>
23-
<attribute name="maven.pomderived" value="true"/>
24-
</attributes>
25-
</classpathentry>
26-
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
27-
<attributes>
28-
<attribute name="maven.pomderived" value="true"/>
29-
</attributes>
30-
</classpathentry>
31-
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
32-
<attributes>
33-
<attribute name="maven.pomderived" value="true"/>
34-
</attributes>
35-
</classpathentry>
36-
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
37-
<attributes>
38-
<attribute name="maven.pomderived" value="true"/>
39-
</attributes>
40-
</classpathentry>
41-
<classpathentry kind="output" path="target/classes"/>
42-
</classpath>
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<classpath>
3+
<classpathentry kind="src" output="target/classes" path="src/main/java">
4+
<attributes>
5+
<attribute name="optional" value="true"/>
6+
<attribute name="maven.pomderived" value="true"/>
7+
</attributes>
8+
</classpathentry>
9+
<classpathentry kind="src" output="target/classes" path="src/main/antlr3">
10+
<attributes>
11+
<attribute name="optional" value="true"/>
12+
<attribute name="maven.pomderived" value="true"/>
13+
</attributes>
14+
</classpathentry>
15+
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
16+
<attributes>
17+
<attribute name="maven.pomderived" value="true"/>
18+
</attributes>
19+
</classpathentry>
20+
<classpathentry including="**/*.java" kind="src" output="target/test-classes" path="src/test/java">
21+
<attributes>
22+
<attribute name="optional" value="true"/>
23+
<attribute name="maven.pomderived" value="true"/>
24+
</attributes>
25+
</classpathentry>
26+
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
27+
<attributes>
28+
<attribute name="maven.pomderived" value="true"/>
29+
</attributes>
30+
</classpathentry>
31+
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
32+
<attributes>
33+
<attribute name="maven.pomderived" value="true"/>
34+
</attributes>
35+
</classpathentry>
36+
<classpathentry exported="true" kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
37+
<attributes>
38+
<attribute name="maven.pomderived" value="true"/>
39+
</attributes>
40+
</classpathentry>
41+
<classpathentry kind="output" path="target/classes"/>
42+
</classpath>
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/*
2+
* Copyright 2013 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.converter.bib;
17+
18+
import java.text.Normalizer;
19+
import java.text.Normalizer.Form;
20+
import java.util.regex.Matcher;
21+
import java.util.regex.Pattern;
22+
23+
import org.culturegraph.mf.exceptions.FormatException;
24+
import org.culturegraph.mf.framework.DefaultStreamPipe;
25+
26+
import org.culturegraph.mf.framework.DefaultStreamPipe;
27+
import org.culturegraph.mf.framework.ObjectReceiver;
28+
import org.culturegraph.mf.framework.StreamReceiver;
29+
import org.culturegraph.mf.framework.annotations.Description;
30+
import org.culturegraph.mf.framework.annotations.In;
31+
import org.culturegraph.mf.framework.annotations.Out;
32+
33+
34+
/**
35+
* Encodes an event stream in pica+ format.
36+
*
37+
* <strong>Special handling of subfield 'S':</strong> the code of
38+
* "control subfields" (subfield name='S') will be appended to the fieldName.
39+
* E.g.: 041A $Saxx would be mapped to the fieldName 041Aa, and xx will be
40+
* ignored. A recovery of such field to original is not implemented. So the
41+
* encoder cannot identify an S-field.
42+
* The S-field special processing can be turned on if the decoder is called
43+
* with the option: (appendcontrolsubfield="true")
44+
* The default value of this option is set to "false".
45+
*
46+
* @see PicaDecoder
47+
*
48+
* @author Yining Li
49+
*
50+
*/
51+
@Description("Encodes a stream in pica+ Format")
52+
@In(StreamReceiver.class)
53+
@Out(String.class)
54+
public final class PicaEncoder extends DefaultStreamPipe<ObjectReceiver<String>> {
55+
56+
private static final String FIELD_DELIMITER = "\u001e";
57+
private static final String SUB_DELIMITER = "\u001f";
58+
private boolean idnControlSubField;
59+
private boolean entityOpen;
60+
private StringBuilder builder = new StringBuilder();
61+
private String id="";
62+
63+
private static final String FIELD_NAME_PATTERN_STRING = "\\d{3}.(/..)?";
64+
private static final Pattern FIELD_NAME_PATTERN = Pattern.compile(FIELD_NAME_PATTERN_STRING);
65+
private boolean ignoreRecordId;
66+
67+
68+
@Override
69+
public void startRecord(final String recordId) {
70+
// the name is a idn, which should be found in the encoded data under 003@.
71+
//any rest of the previous record is cleared before the new begins.
72+
builder.setLength(0);
73+
this.id = recordId;
74+
//Now an entity can be opened. But no literal is allowed.
75+
this.entityOpen = false;
76+
}
77+
78+
public void setIgnoreRecordId(final boolean ignoreRecordId) {
79+
this.ignoreRecordId = ignoreRecordId;
80+
}
81+
82+
public boolean getIgnoreRecordId() {
83+
return this.ignoreRecordId;
84+
}
85+
86+
@Override
87+
public void startEntity(final String name) {
88+
// Here begins a field (i.e. "028A ", which is given in the name.
89+
// It is unknown, whether there are any subfields in the field.
90+
final Matcher fieldNameMatcher = FIELD_NAME_PATTERN.matcher(name);
91+
if (!fieldNameMatcher.matches()) {
92+
throw new FormatException(name);
93+
}
94+
builder.append(name.trim()+ " ");
95+
96+
idnControlSubField = !ignoreRecordId && name.trim().equals("003@");
97+
//Now literals can be opened.
98+
this.entityOpen = true;
99+
}
100+
101+
@Override
102+
public void literal(final String name, final String value) {
103+
//A Subfield has one character or digit exactly.
104+
if (name.length()!=1){
105+
throw new FormatException(name);
106+
}
107+
if (!entityOpen){
108+
throw new FormatException(name); //new exceptions definition for literal out of entity
109+
}
110+
final String valueNew = Normalizer.normalize(value, Form.NFD);
111+
if (idnControlSubField){
112+
// it is a 003@ field, the same record id delivered with record should follow
113+
if (!this.id.equals(value)) {
114+
throw new MissingIdException(value);
115+
}
116+
idnControlSubField = false; //only one record Id will be checked.
117+
}
118+
builder.append(SUB_DELIMITER);
119+
builder.append(name);
120+
builder.append(valueNew);
121+
}
122+
123+
@Override
124+
public void endEntity() {
125+
builder.append(FIELD_DELIMITER);
126+
//Now an entity can be opened. But no literal is allowed.
127+
this.entityOpen = false;
128+
}
129+
130+
@Override
131+
public void endRecord() {
132+
getReceiver().process(builder.toString());
133+
//No literal is allowed.
134+
this.entityOpen = false;
135+
}
136+
@Override
137+
protected void onResetStream() {
138+
builder.setLength(0);
139+
}
140+
141+
}

src/main/resources/flux-commands.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ encode-literals org.culturegraph.mf.stream.converter.StreamLiteralFormater
4444
encode-cgentity org.culturegraph.mf.stream.converter.CGEntityEncoder
4545
encode-formeta org.culturegraph.mf.stream.converter.FormetaEncoder
4646
encode-json org.culturegraph.mf.stream.converter.JsonEncoder
47+
encode-pica org.culturegraph.mf.stream.converter.bib.PicaEncoder
4748

4849
write org.culturegraph.mf.stream.sink.ObjectWriter
4950
write-triples org.culturegraph.mf.stream.sink.TripleWriter

0 commit comments

Comments
 (0)