Skip to content

Commit 40e5764

Browse files
author
mgeipel
committed
added Flux command to calculate co-occurrence metrics (X2, F, Jaccard
etc.)
1 parent de88ee5 commit 40e5764

File tree

2 files changed

+219
-0
lines changed

2 files changed

+219
-0
lines changed
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Copyright 2013 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.pipe.stat;
17+
18+
import java.util.ArrayList;
19+
import java.util.List;
20+
21+
import org.culturegraph.mf.framework.annotations.Description;
22+
import org.culturegraph.mf.framework.annotations.In;
23+
import org.culturegraph.mf.framework.annotations.Out;
24+
import org.culturegraph.mf.types.Triple;
25+
26+
/**
27+
*
28+
*
29+
* @author Markus Geipel
30+
*
31+
*/
32+
@Description("Calculates values for various cooccurrence metrics. The expected inputs are triples containing as subject the var name and as object the count. "
33+
+ "Marginal counts must appear first, joint counts second. Marinal counts must be written as 1:A, Joint counts as 2:A&B")
34+
@In(Triple.class)
35+
@Out(Triple.class)
36+
public final class CooccurrenceMetricCalculator extends AbstractCountProcessor {
37+
38+
/**
39+
* implementation of the different metrics
40+
*/
41+
enum Metric {
42+
X2 {
43+
@Override
44+
double calculate(final int countA, final int countB, final int countAandB, final int total) {
45+
final double o12 = countA - countAandB;
46+
final double o21 = countB - countAandB;
47+
final double o22 = total - countAandB;
48+
final double d = (countAandB * o22) - (o12 * o21);
49+
50+
final double x2 = total * Math.pow(d, 2)
51+
/ ((countAandB + o12) * (countAandB + o21) * (o12 + o22) * (o21 + o22));
52+
return x2 * Math.signum(d);
53+
}
54+
},
55+
F {
56+
@Override
57+
double calculate(final int countA, final int countB, final int countAandB, final int total) {
58+
final double pa = (double) countA / total;
59+
final double pb = (double) countB / total;
60+
final double pab = (double) countAandB / total;
61+
final double precission = pab / pa;
62+
final double recall = pab / pb;
63+
64+
return 2 * precission * recall / (precission + recall);
65+
}
66+
},
67+
PRECISSION {
68+
@Override
69+
double calculate(final int countA, final int countB, final int countAandB, final int total) {
70+
final double pa = (double) countA / total;
71+
final double pab = (double) countAandB / total;
72+
return pab / pa;
73+
}
74+
},
75+
RECALL {
76+
@Override
77+
double calculate(final int countA, final int countB, final int countAandB, final int total) {
78+
final double pb = (double) countB / total;
79+
final double pab = (double) countAandB / total;
80+
return pab / pb;
81+
}
82+
},
83+
JACCARD {
84+
@Override
85+
double calculate(final int countA, final int countB, final int countAandB, final int total) {
86+
return countAandB / (double)(countA + countB - countAandB);
87+
}
88+
};
89+
90+
abstract double calculate(final int countA, final int countB, final int countAandB, final int total);
91+
}
92+
93+
private static final int MIN_COUNT = 5;
94+
95+
private final List<Metric> metrics = new ArrayList<Metric>();
96+
97+
public CooccurrenceMetricCalculator(final String allMetrics) {
98+
final String[] metrics = allMetrics.split("\\s*,\\s*");
99+
setMinCount(MIN_COUNT);
100+
for (String metric : metrics) {
101+
this.metrics.add(Metric.valueOf(metric));
102+
}
103+
}
104+
105+
public CooccurrenceMetricCalculator(final Metric... metrics) {
106+
setMinCount(MIN_COUNT);
107+
for (Metric metric : metrics) {
108+
this.metrics.add(metric);
109+
}
110+
}
111+
112+
@Override
113+
protected void processCount(final String varA, final String varB, final int countA, final int countB,
114+
final int countAandB) {
115+
for (Metric metric : metrics) {
116+
final double value = metric.calculate(countA, countB, countAandB, getTotal());
117+
getReceiver().process(new Triple(varA + "&" + varB, metric.toString(), String.valueOf(value)));
118+
}
119+
}
120+
}
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/*
2+
* Copyright 2013 Deutsche Nationalbibliothek
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.culturegraph.mf.stream.pipe.stat;
17+
18+
import static org.junit.Assert.assertEquals;
19+
20+
import org.culturegraph.mf.framework.DefaultObjectReceiver;
21+
import org.culturegraph.mf.framework.ObjectReceiver;
22+
import org.culturegraph.mf.types.Triple;
23+
import org.junit.Test;
24+
import org.mockito.Mockito;
25+
26+
/**
27+
* Tests {@link CooccurrenceMetricCalculator}.
28+
*
29+
* @author Markus Michael Geipel
30+
*
31+
*/
32+
public final class CooccurrenceMetricCalculatorTest {
33+
34+
private static final double DELTA = 0.01;
35+
private static final int TOTAL = 1000;
36+
private static final int COUNT_A = 100;
37+
private static final int COUNT_B = 50;
38+
private static final int COUNT_A_AND_B = 10;
39+
40+
private static final double X2 = 7.1359;
41+
private static final double F = 0.1333;
42+
private static final double RECALL = 0.199;
43+
private static final double PRECISSION = 0.099;
44+
private static final double JACCARD = 0.0714;
45+
46+
@Test
47+
public void testX2() {
48+
assertEquals(X2, CooccurrenceMetricCalculator.Metric.X2.calculate(COUNT_A, COUNT_B, COUNT_A_AND_B, TOTAL),
49+
DELTA);
50+
}
51+
52+
@Test
53+
public void testF() {
54+
assertEquals(F, CooccurrenceMetricCalculator.Metric.F.calculate(COUNT_A, COUNT_B, COUNT_A_AND_B, TOTAL), DELTA);
55+
}
56+
57+
@Test
58+
public void testPrecission() {
59+
assertEquals(PRECISSION,
60+
CooccurrenceMetricCalculator.Metric.PRECISSION.calculate(COUNT_A, COUNT_B, COUNT_A_AND_B, TOTAL), DELTA);
61+
}
62+
63+
@Test
64+
public void testRecall() {
65+
assertEquals(RECALL,
66+
CooccurrenceMetricCalculator.Metric.RECALL.calculate(COUNT_A, COUNT_B, COUNT_A_AND_B, TOTAL), DELTA);
67+
}
68+
69+
@Test
70+
public void testJaccard() {
71+
assertEquals(JACCARD,
72+
CooccurrenceMetricCalculator.Metric.JACCARD.calculate(COUNT_A, COUNT_B, COUNT_A_AND_B, TOTAL), DELTA);
73+
}
74+
75+
@SuppressWarnings("unchecked")
76+
@Test
77+
public void testAll() {
78+
final CooccurrenceMetricCalculator calculator = new CooccurrenceMetricCalculator("X2, F");
79+
final ObjectReceiver<Triple> receiver = Mockito.mock(ObjectReceiver.class);
80+
calculator.setReceiver(receiver);
81+
calculator.process(new Triple("1:", "", Integer.toString(TOTAL)));
82+
calculator.process(new Triple("1:A", "", Integer.toString(COUNT_A)));
83+
calculator.process(new Triple("1:B", "", Integer.toString(COUNT_B)));
84+
calculator.process(new Triple("2:A&B", "", Integer.toString(COUNT_A_AND_B)));
85+
86+
Mockito.verify(receiver).process(new Triple("A&B", CooccurrenceMetricCalculator.Metric.X2.toString(), Double.toString(CooccurrenceMetricCalculator.Metric.X2.calculate(COUNT_A, COUNT_B, COUNT_A_AND_B, TOTAL))));
87+
}
88+
89+
90+
@Test(expected=IllegalArgumentException.class)
91+
public void testIllegalArgument() {
92+
final CooccurrenceMetricCalculator calculator = new CooccurrenceMetricCalculator("X2");
93+
calculator.setReceiver(new DefaultObjectReceiver<Triple>());
94+
calculator.process(new Triple("2:x&x", "", Integer.toString(COUNT_A_AND_B)));
95+
calculator.process(new Triple("1:x", "", Integer.toString(COUNT_B)));
96+
97+
98+
}
99+
}

0 commit comments

Comments
 (0)