Skip to content

Commit e2c15af

Browse files
committed
Add encoder for MARC21 records.
The `Marc21Encoder` uses the ISO 2709:2008 record builder introduced in commit 1c13d5a. The stream processed by the encoder needs to follow certain requirements described in the Javadoc of the class. The stream format is compatible with the streams produced by the `MarcDecoder` and `MarcXmlHandler` modules. The encoder is available as a flux command.
1 parent 1c13d5a commit e2c15af

File tree

3 files changed

+266
-0
lines changed

3 files changed

+266
-0
lines changed
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
/*
2+
* Copyright 2014 Christoph Böhme
3+
*
4+
* Licensed under the Apache License, Version 2.0 the "License";
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.converter.bib;
17+
18+
import org.culturegraph.mf.exceptions.FormatException;
19+
import org.culturegraph.mf.framework.DefaultStreamPipe;
20+
import org.culturegraph.mf.framework.ObjectReceiver;
21+
import org.culturegraph.mf.framework.StreamReceiver;
22+
import org.culturegraph.mf.framework.annotations.Description;
23+
import org.culturegraph.mf.framework.annotations.In;
24+
import org.culturegraph.mf.framework.annotations.Out;
25+
import org.culturegraph.mf.iso2709.Iso2709Format;
26+
import org.culturegraph.mf.iso2709.RecordBuilder;
27+
import org.culturegraph.mf.iso2709.RecordFormat;
28+
29+
/**
30+
* Encodes a stream in MARC21 format.
31+
*
32+
* <p>MARC21 supports two types of fields: reference fields and data fields.
33+
* Reference fields consist of a name tag and a single value. Data fields have a
34+
* name tag, two indicators and consist of subfields which have an identifier
35+
* each.
36+
* </p>
37+
*
38+
* <p>The {@code Marc21Encoder} encodes a stream as follows:
39+
* </p>
40+
*
41+
* <ul>
42+
* <li>top-level literals are encoded as reference fields. Their name must match
43+
* the requirements for reference field tags in ISO 2709:2008 records.</li>
44+
*
45+
* <li>entities are encoded as data fields. Only one level of entities is
46+
* supported. The entity name must consist of a three letter tag name followed
47+
* by two indicator characters. The tag name must follow the requirements for
48+
* data field tags in ISO 2709:2008 records.</li>
49+
*
50+
* <li>Literals in entities are encoded as subfields. The literal name is used
51+
* as subfield indicator and must therefore be a single character.</li>
52+
*
53+
* <li>If a literal named "leader" is encountered it is treated as an ISO
54+
* 2709:2008 record label and some of its contents (record status,
55+
* implementation codes, user system characters) are copied into the generated
56+
* record</li>
57+
* </ul>
58+
*
59+
* <p>The stream expected by the encoder is compatible with the streams emitted by
60+
* the {@link MarcDecoder} and the {@code MarcXmlHandler}.
61+
* </p>
62+
*
63+
* <p>The record identifier in {@code startRecord} is ignored. To add an identifier
64+
* to the MARC21 record, a reference field with tag name 001 needs to be added.
65+
* </p>
66+
*
67+
* <p>A {@link FormatException} is thrown
68+
* if the stream cannot be converted into a MARC21 record.</p>
69+
*
70+
* @author Christoph Böhme
71+
*
72+
*/
73+
@In(StreamReceiver.class)
74+
@Out(String.class)
75+
@Description("Encodes MARC21 records")
76+
public final class Marc21Encoder extends
77+
DefaultStreamPipe<ObjectReceiver<String>> {
78+
79+
public static final String LEADER_LITERAL = "leader";
80+
81+
private static final RecordFormat MARC21 = new RecordFormat();
82+
83+
private final RecordBuilder builder = new RecordBuilder(MARC21);
84+
private final int nameLength;
85+
86+
private boolean inField;
87+
88+
// CHECKSTYLE OFF: MagicNumber
89+
static {
90+
MARC21.setIndicatorLength(2);
91+
MARC21.setIdentifierLength(2);
92+
MARC21.setFieldLengthLength(4);
93+
MARC21.setFieldStartLength(5);
94+
MARC21.setImplDefinedPartLength(0);
95+
}
96+
// CHECKSTYLE ON: MagicNumber
97+
98+
public Marc21Encoder() {
99+
super();
100+
nameLength = Iso2709Format.TAG_LENGTH + MARC21.getIndicatorLength();
101+
}
102+
103+
@Override
104+
public void startRecord(final String identifier) {
105+
inField = false;
106+
builder.reset();
107+
}
108+
109+
@Override
110+
public void endRecord() {
111+
getReceiver().process(builder.toString());
112+
}
113+
114+
@Override
115+
public void startEntity(final String name) {
116+
if (name.length() != nameLength) {
117+
throw new FormatException("invalid entity name: " + name);
118+
}
119+
120+
final String tag = name.substring(0, Iso2709Format.TAG_LENGTH);
121+
final String indicators = name.substring(Iso2709Format.TAG_LENGTH);
122+
builder.startField(tag, indicators);
123+
inField = true;
124+
}
125+
126+
@Override
127+
public void endEntity() {
128+
inField = false;
129+
builder.endField();
130+
}
131+
132+
@Override
133+
public void literal(final String name, final String value) {
134+
if (LEADER_LITERAL.equals(name)) {
135+
setRecordLabel(value);
136+
} else if (inField) {
137+
builder.appendSubfield(name, value);
138+
} else {
139+
builder.appendReferenceField(name, value);
140+
}
141+
}
142+
143+
private void setRecordLabel(final String value) {
144+
if (value.length() != Iso2709Format.RECORD_LABEL_LENGTH) {
145+
throw new FormatException("leader must be 24 characters long");
146+
}
147+
builder.setRecordStatus(value.charAt(Iso2709Format.RECORD_STATUS_POS));
148+
builder.setImplCodes(value.substring(Iso2709Format.IMPL_CODES_START,
149+
Iso2709Format.IMPL_CODES_END));
150+
builder.setSystemChars(value.substring(
151+
Iso2709Format.SYSTEM_CHARS_START,
152+
Iso2709Format.SYSTEM_CHARS_END));
153+
}
154+
155+
}

src/main/resources/flux-commands.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ encode-cgentity org.culturegraph.mf.stream.converter.CGEntityEncoder
4848
encode-formeta org.culturegraph.mf.stream.converter.FormetaEncoder
4949
encode-json org.culturegraph.mf.stream.converter.JsonEncoder
5050
encode-pica org.culturegraph.mf.stream.converter.bib.PicaEncoder
51+
encode-marc21 org.culturegraph.mf.stream.converter.bib.Marc21Encoder
5152

5253
write org.culturegraph.mf.stream.sink.ObjectWriter
5354
write-triples org.culturegraph.mf.stream.sink.TripleWriter
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
* Copyright 2014 Christoph Böhme
3+
*
4+
* Licensed under the Apache License, Version 2.0 the "License";
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.converter.bib;
17+
18+
import static org.mockito.Matchers.matches;
19+
import static org.mockito.Mockito.verify;
20+
21+
import org.culturegraph.mf.exceptions.FormatException;
22+
import org.culturegraph.mf.framework.ObjectReceiver;
23+
import org.junit.After;
24+
import org.junit.Before;
25+
import org.junit.Test;
26+
import org.mockito.Mock;
27+
import org.mockito.MockitoAnnotations;
28+
29+
/**
30+
* Tests for class {@link Marc21Encoder}.
31+
*
32+
* @author Christoph Böhme
33+
*
34+
*/
35+
public final class Marc21EncoderTest {
36+
37+
private static final String LEADER_LITERAL = "leader";
38+
39+
private Marc21Encoder marc21Encoder;
40+
41+
@Mock
42+
private ObjectReceiver<String> receiver;
43+
44+
@Before
45+
public void setup() {
46+
MockitoAnnotations.initMocks(this);
47+
marc21Encoder = new Marc21Encoder();
48+
marc21Encoder.setReceiver(receiver);
49+
}
50+
51+
@After
52+
public void cleanup() {
53+
marc21Encoder.closeStream();
54+
}
55+
56+
@Test
57+
public void shouldWriteInformationFromLeaderToRecordLabel() {
58+
marc21Encoder.startRecord("");
59+
marc21Encoder.literal(LEADER_LITERAL, "00000SIMPL2200000SYS450R");
60+
marc21Encoder.endRecord();
61+
62+
verify(receiver).process(matches("^.{5}SIMPL.{7}SYS.*"));
63+
}
64+
65+
@Test(expected = FormatException.class)
66+
public void shouldThrowFormatExceptionIfLeaderDoesNotMatchRecordLabelLength() {
67+
marc21Encoder.startRecord("");
68+
marc21Encoder.literal(LEADER_LITERAL, "too short");
69+
}
70+
71+
@Test
72+
public void shouldOutputTopLevelLiteralsAsReferenceFields() {
73+
marc21Encoder.startRecord("");
74+
marc21Encoder.literal("001", "identifier");
75+
marc21Encoder.endRecord();
76+
77+
verify(receiver).process(
78+
matches(".*001001100000\u001eidentifier\u001e.*"));
79+
}
80+
81+
@Test
82+
public void shouldOutputEntitiesAsDataFields() {
83+
marc21Encoder.startRecord("");
84+
marc21Encoder.startEntity("021a ");
85+
marc21Encoder.endEntity();
86+
marc21Encoder.endRecord();
87+
88+
verify(receiver).process(matches(".*021000300000\u001ea \u001e.*"));
89+
}
90+
91+
@Test
92+
public void shouldOutputLiteralsInEntitiesAsSubfields() {
93+
marc21Encoder.startRecord("");
94+
marc21Encoder.startEntity("021a ");
95+
marc21Encoder.literal("v", "Fritz");
96+
marc21Encoder.literal("n", "Bauer");
97+
marc21Encoder.endEntity();
98+
marc21Encoder.endRecord();
99+
100+
verify(receiver).process(
101+
matches(".*021001700000\u001ea \u001fvFritz\u001fnBauer\u001e.*"));
102+
}
103+
104+
@Test(expected = FormatException.class)
105+
public void shouldThrowFormatExceptionIfEntityNameLengthIsNotFive() {
106+
marc21Encoder.startRecord("");
107+
marc21Encoder.startEntity("012abc");
108+
}
109+
110+
}

0 commit comments

Comments
 (0)