Skip to content

Commit cc26b73

Browse files
committed
Merge pull request #11 from Wikidata-lib/ConsiderClassifyingProperties
Consider classifying properties
2 parents 9f97719 + 34f9af9 commit cc26b73

File tree

6 files changed

+44
-25
lines changed

6 files changed

+44
-25
lines changed

propertysuggester/analyzer/RuleGenerator.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,24 @@
1+
import ConfigParser
2+
import os
13
from collections import defaultdict
24
import itertools
35
from propertysuggester.analyzer.impl.MainAnalyzer import ItemAnalyzer
46
from propertysuggester.analyzer.impl.QualifierReferenceAnalyzer import QualifierAnalyzer, ReferenceAnalyzer
57
from propertysuggester.analyzer.rule import Rule
68
from propertysuggester.utils.datamodel import Entity
79

10+
config = ConfigParser.ConfigParser()
11+
config.read(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'analyzer.ini'))
12+
classifying_pids = config.get("mainAnalyzer","classifying_properties").split(",")
13+
classifying_pids = map(int, classifying_pids)
814

915
def compute_rules(entities, min_probability=0.01):
1016
"""
1117
@type entities: collections.Iterable[Entity]
1218
@return: list[Rule]
1319
"""
1420

15-
analyzers = [ItemAnalyzer(), QualifierAnalyzer(), ReferenceAnalyzer()]
21+
analyzers = [ItemAnalyzer(classifying_pids), QualifierAnalyzer(), ReferenceAnalyzer()]
1622

1723
for i, entity in enumerate(entities):
1824
if i % 100000 == 0 and i > 0:
propertysuggester/analyzer/analyzer.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[mainAnalyzer]
2+
# List of classifying properties; currently limited to 'instance of' (31) and 'subclass of' (279)
3+
classifying_properties = 31,279
propertysuggester/analyzer/impl/MainAnalyzer.py

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,42 @@
11
from collections import defaultdict
2-
import math
32
from propertysuggester.analyzer.impl.Analyzer import Analyzer
43
from propertysuggester.analyzer.rule import Rule
54

6-
75
class ItemAnalyzer(Analyzer):
8-
def __init__(self):
6+
def __init__(self, classiying_property_ids = [31,279]):
97
Analyzer.__init__(self)
10-
self.property_occurances = defaultdict(int)
11-
self.pair_occurances = defaultdict(lambda: defaultdict(int))
8+
self.classiying_pids = classiying_property_ids
9+
self.tuple_occurrences = defaultdict(int)
10+
self.pair_occurrences = defaultdict(lambda: defaultdict(int))
1211

13-
def process(self, entity):
14-
distinct_ids = set(claim.mainsnak.property_id for claim in entity.claims)
15-
self._count_occurances(distinct_ids)
12+
def process(self, item):
13+
distinct_ids = set(claim.mainsnak.property_id for claim in item.claims)
14+
property_value_pairs = [(claim.mainsnak.property_id, claim.mainsnak.value) for claim in item.claims]
15+
self._count_occurrences(distinct_ids, property_value_pairs)
1616

17-
def _count_occurances(self, distinct_ids):
17+
def _count_occurrences(self, distinct_ids, property_value_pairs):
1818
for pid1 in distinct_ids:
19-
self.property_occurances[pid1] += 1
19+
if pid1 in self.classiying_pids:
20+
continue
21+
currentTuple = (pid1, None)
22+
self.tuple_occurrences[currentTuple] += 1
2023
for pid2 in distinct_ids:
2124
if pid1 != pid2:
22-
self.pair_occurances[pid1][pid2] += 1
25+
self.pair_occurrences[currentTuple][pid2] += 1
26+
27+
for pid1, value in property_value_pairs:
28+
if pid1 in self.classiying_pids and value[1:].isdigit():
29+
self.tuple_occurrences[pid1, int(value[1:])] += 1
30+
for pid2 in distinct_ids:
31+
if pid1 != pid2:
32+
self.pair_occurrences[pid1, int(value[1:])][pid2] += 1
2333

2434
def get_rules(self):
2535
rules = []
26-
totalpropertycount = len(self.property_occurances)
27-
for pid1, row in self.pair_occurances.iteritems():
28-
sharedpids = len(row)
29-
idf = math.log(totalpropertycount/float(sharedpids))
30-
pid1count = self.property_occurances[pid1]
36+
for (pid1, value), row in self.pair_occurrences.iteritems():
37+
pid1count = self.tuple_occurrences[pid1, value]
3138
for pid2, paircount in row.iteritems():
3239
if paircount > 0:
33-
probability = (paircount/float(pid1count)) * idf
34-
rules.append(Rule(pid1, None, pid2, paircount, probability, "item"))
40+
probability = (paircount/float(pid1count))
41+
rules.append(Rule(pid1, value, pid2, paircount, probability, "item"))
3542
return rules

propertysuggester/test/analyzer/test_rule_generator.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import unittest
2-
import math
32

43
from testtools import TestCase
54
from testtools.matchers import *
@@ -11,7 +10,8 @@
1110

1211
test_data1 = [Entity('Q15', [Claim(Snak(31, 'wikibase-entityid', 'Q5107')),
1312
Claim(Snak(373, 'string', 'Africa'))]),
14-
Entity('Q16', [Claim(Snak(31, 'wikibase-entityid', 'Q384'))])]
13+
Entity('Q16', [Claim(Snak(31, 'wikibase-entityid', 'Q5107'))]),
14+
Entity('Q17', [Claim(Snak(31, 'wikibase-entityid', 'Q1337'))])]
1515

1616
test_data2 = [Entity('Q15', [Claim(Snak(31, 'wikibase-entityid', 'Q5107')),
1717
Claim(Snak(373, 'string', 'Africa')),
@@ -28,13 +28,12 @@ def setUp(self):
2828

2929
def test_table_generator(self):
3030
rules = list(RuleGenerator.compute_rules(test_data1))
31-
self.assertThat(rules, ContainsAll([Rule(31, None, 373, 1, 0.5*math.log(2), "item"),
32-
Rule(373, None, 31, 1, 1.0*math.log(2), "item")]))
31+
self.assertThat(rules, ContainsAll([Rule(31, 5107, 373, 1, 0.5, "item"), Rule(373, None, 31, 1, 1.0, "item")]))
32+
3333

3434
def test_table_with_multiple_occurance(self):
3535
rules = list(RuleGenerator.compute_rules(test_data2))
36-
self.assertThat(rules, ContainsAll([Rule(31, None, 373, 1, 1.0*math.log(2), "item"),
37-
Rule(373, None, 31, 1, 1.0*math.log(2), "item")]))
36+
self.assertThat(rules, ContainsAll([Rule(31, 5107, 373, 1, 1.0, "item"), Rule(373, None, 31, 1, 1.0, "item")]))
3837

3938
def test_table_with_qualifier_and_references(self):
4039
rules = list(RuleGenerator.compute_rules(test_data3))
7.67 KB
Binary file not shown.

readme.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ nosetests
3030

3131
## Release Notes
3232

33+
### 1.2
34+
* Consider classifying Properties
35+
* Use JSON dumps for analysis
36+
3337
### 1.1
3438
* Generate association rules for qualifiers and references
3539
* Improve ranking to avoid suggestions of human properties

0 commit comments

Comments
 (0)