Skip to content

Commit db5af0b

Browse files
committed
Merge pull request #12 from Wikidata-lib/use_idf_to_rank_common_properties_lower
try to reduce errors from id-properties
2 parents 31cd047 + ea23779 commit db5af0b

File tree

2 files changed

+10
-4
lines changed

2 files changed

+10
-4
lines changed

propertysuggester/analyzer/impl/MainAnalyzer.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from collections import defaultdict
2+
import math
23
from propertysuggester.analyzer.impl.Analyzer import Analyzer
34
from propertysuggester.analyzer.rule import Rule
45

@@ -22,10 +23,13 @@ def _count_occurances(self, distinct_ids):
2223

2324
def get_rules(self):
2425
rules = []
26+
totalpropertycount = len(self.property_occurances)
2527
for pid1, row in self.pair_occurances.iteritems():
28+
sharedpids = len(row)
29+
idf = math.log(totalpropertycount/float(sharedpids))
2630
pid1count = self.property_occurances[pid1]
2731
for pid2, paircount in row.iteritems():
2832
if paircount > 0:
29-
probability = (paircount/float(pid1count))
33+
probability = (paircount/float(pid1count)) * idf
3034
rules.append(Rule(pid1, None, pid2, paircount, probability, "item"))
3135
return rules

propertysuggester/test/analyzer/test_rule_generator.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import unittest
2+
import math
23

34
from testtools import TestCase
45
from testtools.matchers import *
@@ -27,12 +28,13 @@ def setUp(self):
2728

2829
def test_table_generator(self):
2930
rules = list(RuleGenerator.compute_rules(test_data1))
30-
self.assertThat(rules, ContainsAll([Rule(31, None, 373, 1, 0.5, "item"), Rule(373, None, 31, 1, 1.0, "item")]))
31-
31+
self.assertThat(rules, ContainsAll([Rule(31, None, 373, 1, 0.5*math.log(2), "item"),
32+
Rule(373, None, 31, 1, 1.0*math.log(2), "item")]))
3233

3334
def test_table_with_multiple_occurance(self):
3435
rules = list(RuleGenerator.compute_rules(test_data2))
35-
self.assertThat(rules, ContainsAll([Rule(31, None, 373, 1, 1.0, "item"), Rule(373, None, 31, 1, 1.0, "item")]))
36+
self.assertThat(rules, ContainsAll([Rule(31, None, 373, 1, 1.0*math.log(2), "item"),
37+
Rule(373, None, 31, 1, 1.0*math.log(2), "item")]))
3638

3739
def test_table_with_qualifier_and_references(self):
3840
rules = list(RuleGenerator.compute_rules(test_data3))

0 commit comments

Comments
 (0)