Skip to content

Commit 3077498

Browse files
author
mgeipel
committed
Merge branch 'master' of ssh://github.com/culturegraph/metafacture-core
2 parents 271fc14 + 78ffd02 commit 3077498

File tree

4 files changed

+126
-23
lines changed

4 files changed

+126
-23
lines changed
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
* Copyright 2013 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.pipe;
17+
18+
import java.text.Normalizer;
19+
import java.text.Normalizer.Form;
20+
21+
import org.culturegraph.mf.framework.DefaultObjectPipe;
22+
import org.culturegraph.mf.framework.ObjectReceiver;
23+
import org.culturegraph.mf.framework.annotations.Description;
24+
import org.culturegraph.mf.framework.annotations.In;
25+
import org.culturegraph.mf.framework.annotations.Out;
26+
27+
/**
28+
* Normalises diacritics in UTF-8 encoded strings.
29+
*
30+
* @author Christoph Böhme
31+
*
32+
*/
33+
@Description("Normalizes diacritics UTF-8 encoded strings.")
34+
@In(String.class)
35+
@Out(String.class)
36+
public final class Utf8Normalizer extends
37+
DefaultObjectPipe<String, ObjectReceiver<String>> {
38+
39+
@Override
40+
public void process(final String str) {
41+
getReceiver().process(Normalizer.normalize(str, Form.NFC));
42+
}
43+
44+
}

src/main/resources/flux-commands.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ draw-uniform-sample org.culturegraph.mf.stream.pipe.UniformSampler
7878

7979
catch-object-exception org.culturegraph.mf.stream.pipe.ObjectExceptionCatcher
8080

81+
normalize-utf8 org.culturegraph.mf.stream.pipe.Utf8Normalizer
8182

8283
morph org.culturegraph.mf.morph.Metamorph
8384
filter org.culturegraph.mf.stream.pipe.Filter

src/test/java/org/culturegraph/mf/morph/functions/NormalizeUTF8Test.java

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,31 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16-
package org.culturegraph.mf.morph.functions;
17-
18-
import static org.junit.Assert.assertEquals;
19-
20-
import org.culturegraph.mf.morph.functions.NormalizeUTF8;
21-
import org.junit.Test;
22-
23-
24-
/**
25-
* @author Christoph Böhme <[email protected]>
26-
*
27-
*/
28-
public final class NormalizeUTF8Test {
29-
30-
private static final String INPUT_STR = "Bauer, Sigmund: Über den Einfluß der Ackergeräthe auf den Reinertrag.";
31-
private static final String OUTPUT_STR = "Bauer, Sigmund: Über den Einfluß der Ackergeräthe auf den Reinertrag.";
32-
33-
@Test
34-
public void testProcess() {
35-
final NormalizeUTF8 normalize = new NormalizeUTF8();
36-
assertEquals("Normalization incorrect", OUTPUT_STR, normalize.process(INPUT_STR));
37-
}
38-
}
16+
package org.culturegraph.mf.morph.functions;
17+
18+
import static org.junit.Assert.assertEquals;
19+
20+
import org.culturegraph.mf.morph.functions.NormalizeUTF8;
21+
import org.junit.Test;
22+
23+
24+
/**
25+
* @author Christoph Böhme <[email protected]>
26+
*
27+
*/
28+
public final class NormalizeUTF8Test {
29+
30+
// The umlauts in this string are composed of two characters (u and ", e.g.):
31+
private static final String INPUT_STR =
32+
"Bauer, Sigmund: Über den Einfluß der Ackergeräthe auf den Reinertrag.";
33+
34+
// The umlauts in this string are individual characters:
35+
private static final String OUTPUT_STR =
36+
"Bauer, Sigmund: Über den Einfluß der Ackergeräthe auf den Reinertrag.";
37+
38+
@Test
39+
public void testProcess() {
40+
final NormalizeUTF8 normalize = new NormalizeUTF8();
41+
assertEquals("Normalization incorrect", OUTPUT_STR, normalize.process(INPUT_STR));
42+
}
43+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Copyright 2013 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.pipe;
17+
18+
import static org.junit.Assert.assertEquals;
19+
20+
import org.culturegraph.mf.framework.DefaultObjectReceiver;
21+
import org.junit.Test;
22+
23+
/**
24+
* Tests {@link Utf8Normalizer}.
25+
*
26+
* @author Christoph Böhme
27+
*
28+
*/
29+
public final class Utf8NormalizerTest {
30+
31+
// The umlauts in this string are composed of two characters (u and ", e.g.):
32+
private static final String INPUT_STR =
33+
"Bauer, Sigmund: Über den Einfluß der Ackergeräthe auf den Reinertrag.";
34+
35+
// The umlauts in this string are individual characters:
36+
private static final String OUTPUT_STR =
37+
"Bauer, Sigmund: Über den Einfluß der Ackergeräthe auf den Reinertrag.";
38+
39+
@Test
40+
public void testNormalization() {
41+
final Utf8Normalizer normalizer = new Utf8Normalizer();
42+
43+
normalizer.setReceiver(new DefaultObjectReceiver<String>() {
44+
@Override
45+
public void process(final String obj) {
46+
assertEquals(OUTPUT_STR, obj);
47+
}
48+
});
49+
50+
normalizer.process(INPUT_STR);
51+
}
52+
53+
}

0 commit comments

Comments
 (0)