Skip to content

Commit 31cd047

Browse files
committed
Merge pull request #10 from Wikidata-lib/suggestQualifiersAndReferences
Suggest qualifiers and references
2 parents d74491b + 5ca01d8 commit 31cd047

22 files changed

+384
-198
lines changed

analyzer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import sys
33
import time
44

5-
from propertysuggester.analyzer import CsvGenerator, TableGenerator
5+
from propertysuggester.analyzer import CsvGenerator, RuleGenerator
66
from propertysuggester.parser import CsvReader
77
from propertysuggester.utils.CompressedFileType import CompressedFileType
88

@@ -16,8 +16,8 @@
1616

1717
start = time.time()
1818
print "computing table"
19-
t = TableGenerator.compute_table(CsvReader.read_csv(args.input))
19+
rules = RuleGenerator.compute_rules(CsvReader.read_csv(args.input))
2020
print "writing csv"
21-
CsvGenerator.create_pair_csv(t, args.output)
21+
CsvGenerator.create_pair_csv(rules, args.output)
2222
print "done - {0:.2f}s".format(time.time()-start)
2323
print "now import this csv file with PropertySuggester/maintenance/UpdateTable.php"
Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,19 @@
11
import csv
2+
from propertysuggester.analyzer.rule import Rule
23

34

4-
def create_pair_csv(table, out, delimiter=","):
5+
def create_pair_csv(rules, out, delimiter=","):
    """
    Write the given rules to out as CSV, preceded by a header row.

    Every rule becomes one row with the columns
    pid1, qid1, pid2, count, probability, context. A qid1 of None is
    written as an empty field. A progress line is printed after every
    1000 rows.

    @type rules: list[Rule]
    @type out: file or StringIO.StringIO
    @type delimiter: string
    """
    csv_writer = csv.writer(out, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

    csv_writer.writerow(("pid1", "qid1", "pid2", "count", "probability", "context"))
    for rowcount, rule in enumerate(rules, 1):
        # Only a missing qid becomes an empty field; a falsy but real
        # value such as 0 must still be written out.
        qid1 = '' if rule.qid1 is None else rule.qid1
        csv_writer.writerow((rule.pid1, qid1, rule.pid2, rule.count, rule.probability, rule.context))
        if rowcount % 1000 == 0:
            # Single-argument call form prints the same on Python 2 and 3.
            print("rows {0}".format(rowcount))
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from collections import defaultdict
2+
import itertools
3+
from propertysuggester.analyzer.impl.MainAnalyzer import ItemAnalyzer
4+
from propertysuggester.analyzer.impl.QualifierReferenceAnalyzer import QualifierAnalyzer, ReferenceAnalyzer
5+
from propertysuggester.analyzer.rule import Rule
6+
from propertysuggester.utils.datamodel import Entity
7+
8+
9+
def compute_rules(entities, min_probability=0.01, analyzers=None):
    """
    Feed every entity to each analyzer and collect the rules they produce.

    Only rules whose probability is strictly greater than min_probability
    are returned.

    @type entities: collections.Iterable[Entity]
    @type min_probability: float
    @param analyzers: objects providing process(entity) and get_rules();
                      when None, an ItemAnalyzer, a QualifierAnalyzer and
                      a ReferenceAnalyzer are used
    @return: list[Rule]
    """
    if analyzers is None:
        analyzers = [ItemAnalyzer(), QualifierAnalyzer(), ReferenceAnalyzer()]

    for i, entity in enumerate(entities):
        if i % 100000 == 0 and i > 0:
            # Single-argument call form prints the same on Python 2 and 3.
            print("entities {0}".format(i))
        for analyzer in analyzers:
            analyzer.process(entity)

    all_rules = itertools.chain.from_iterable(a.get_rules() for a in analyzers)
    # A list comprehension (rather than filter) guarantees the documented
    # list return type on Python 3 as well, where filter() is lazy.
    return [rule for rule in all_rules if rule.probability > min_probability]
25+

propertysuggester/analyzer/TableGenerator.py

Lines changed: 0 additions & 43 deletions
This file was deleted.
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from collections import defaultdict
2+
from propertysuggester.analyzer.rule import Rule
3+
from propertysuggester.utils.datamodel import Entity
4+
5+
class Analyzer:
6+
def __init__(self):
7+
pass
8+
9+
def process(self, entity):
10+
"""
11+
@type entity: Entity
12+
"""
13+
raise NotImplementedError("Please implement this method")
14+
15+
def get_rules(self):
16+
"""
17+
@return: list[Rule]
18+
"""
19+
raise NotImplementedError("Please implement this method")
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from collections import defaultdict
2+
from propertysuggester.analyzer.impl.Analyzer import Analyzer
3+
from propertysuggester.analyzer.rule import Rule
4+
5+
6+
class ItemAnalyzer(Analyzer):
    """
    Counts how often main-snak properties occur, and co-occur, on entities.

    Each property is counted at most once per entity. get_rules() turns
    the counts into "item" rules whose probability is
    pair count / number of entities carrying pid1.
    """

    def __init__(self):
        Analyzer.__init__(self)
        # property id -> number of entities carrying that property
        self.property_occurances = defaultdict(int)
        # pid1 -> pid2 -> number of entities carrying both properties
        self.pair_occurances = defaultdict(lambda: defaultdict(int))

    def process(self, entity):
        """
        Count the distinct main-snak property ids of the entity's claims.

        @type entity: Entity
        """
        distinct_ids = set(claim.mainsnak.property_id for claim in entity.claims)
        self._count_occurances(distinct_ids)

    def _count_occurances(self, distinct_ids):
        """Update single and ordered-pair counters for one set of ids."""
        for pid1 in distinct_ids:
            self.property_occurances[pid1] += 1
            for pid2 in distinct_ids:
                if pid1 != pid2:
                    self.pair_occurances[pid1][pid2] += 1

    def get_rules(self):
        """
        Build one rule per ordered property pair with a positive count.

        @return: list[Rule]
        """
        rules = []
        # items() rather than iteritems(): same pairs, works on Python 2 and 3.
        for pid1, row in self.pair_occurances.items():
            pid1count = float(self.property_occurances[pid1])
            for pid2, paircount in row.items():
                # Merely reading a defaultdict entry creates a zero count;
                # such entries must not turn into rules.
                if paircount > 0:
                    probability = paircount / pid1count
                    rules.append(Rule(pid1, None, pid2, paircount, probability, "item"))
        return rules
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from collections import defaultdict
2+
from propertysuggester.analyzer.impl.Analyzer import Analyzer
3+
from propertysuggester.analyzer.rule import Rule
4+
5+
6+
class QualifierAnalyzer(Analyzer):
    """
    Counts which qualifier properties appear on claims of each main property.

    Only claims that carry at least one qualifier are counted, so the
    probability of a rule is relative to the claims of that main property
    which have any qualifier at all. Subclasses can analyze a different
    kind of snak by overriding get_special() and setting self.context.
    """

    def __init__(self):
        Analyzer.__init__(self)
        # main property id -> number of claims with at least one special snak
        self.main_occurances = defaultdict(int)
        # main property id -> special property id -> number of claims
        self.qualifier_occurances = defaultdict(lambda: defaultdict(int))
        # value stored in the context field of every generated rule
        self.context = "qualifier"

    def process(self, entity):
        """
        Count, per claim, the distinct property ids returned by get_special().

        @type entity: Entity
        """
        for claim in entity.claims:
            distinct_pids = set(q.property_id for q in self.get_special(claim))
            if distinct_pids:
                main_pid = claim.mainsnak.property_id
                self.main_occurances[main_pid] += 1
                self._count_special_appearances(main_pid, distinct_pids)

    def _count_special_appearances(self, mainsnak_id, distinct_ids):
        """Increment the pair counter of mainsnak_id with every given id."""
        for pid in distinct_ids:
            self.qualifier_occurances[mainsnak_id][pid] += 1

    def get_special(self, claim):
        """Return the snaks of the claim that this analyzer counts."""
        return claim.qualifiers

    def get_rules(self):
        """
        Build one rule per (main property, special property) pair with a
        positive count.

        @return: list[Rule]
        """
        rules = []
        # items() rather than iteritems(): same pairs, works on Python 2 and 3.
        for main_pid, row in self.qualifier_occurances.items():
            maincount = float(self.main_occurances[main_pid])
            for qualifier_pid, paircount in row.items():
                # Merely reading a defaultdict entry creates a zero count;
                # such entries must not turn into rules.
                if paircount > 0:
                    probability = paircount / maincount
                    rules.append(Rule(main_pid, None, qualifier_pid, paircount, probability, self.context))
        return rules
37+
38+
39+
class ReferenceAnalyzer(QualifierAnalyzer):
    """
    Variant of QualifierAnalyzer that counts the references of a claim
    instead of its qualifiers and labels its rules "reference".
    """

    def __init__(self):
        QualifierAnalyzer.__init__(self)
        # Replaces the "qualifier" label assigned by the parent constructor.
        self.context = "reference"

    def get_special(self, claim):
        """Return the claim's references as the snaks to be counted."""
        return claim.references
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__author__ = 'Christian'

propertysuggester/analyzer/rule.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
2+
class Rule(object):
    """
    One suggestion rule: the count and probability with which pid2
    accompanies pid1 (optionally narrowed by qid1) in a given context.

    Rules compare equal when all of their attributes are equal.
    """

    def __init__(self, pid1, qid1, pid2, count, probability, context):
        """
        @type pid1: int
        @type qid1: int|None
        @type pid2: int
        @type count: int
        @type probability: float
        @type context: string
        """
        self.pid1 = pid1
        self.qid1 = qid1
        self.pid2 = pid2
        self.count = count
        self.probability = probability
        self.context = context

    def __eq__(self, other):
        return isinstance(other, Rule) and self.__dict__ == other.__dict__

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__; without this method two
        # equal rules would still compare as "not equal".
        return not self.__eq__(other)

    def __hash__(self):
        # Keeps hashing consistent with __eq__ (equal rules, equal hashes).
        return hash((self.pid1, self.qid1, self.pid2, self.count, self.probability, self.context))

    def __repr__(self):
        return "Rule({0!r}, {1!r}, {2!r}, {3!r}, {4!r}, {5!r})".format(
            self.pid1, self.qid1, self.pid2, self.count, self.probability, self.context)

    def __str__(self):
        return str(self.__dict__)

propertysuggester/parser/CsvReader.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"""
1010
import csv
1111

12-
from propertysuggester.utils.datamodel import Claim, Entity
12+
from propertysuggester.utils.datamodel import Claim, Entity, Snak
1313

1414

1515
def read_csv(input_file, delimiter=","):
@@ -19,21 +19,30 @@ def read_csv(input_file, delimiter=","):
1919
@type delimiter: str
2020
"""
2121
current_title = None
22+
current_claim = None
2223
claims = []
2324
csv_reader = csv.reader(input_file, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL)
2425

25-
for row_count, row in enumerate(csv_reader):
26-
if len(row) != 4:
27-
raise ValueError("Error in line {0}: {1}".format(row_count, row))
28-
title, prop, datatype, value = row
26+
for row in csv_reader:
27+
if len(row) != 5:
28+
print "error: {0}".format(row)
29+
title, typ, property_id, datatype, value = row
2930
if current_title != title:
3031
if current_title is not None:
3132
yield Entity(current_title, claims)
3233
current_title = title
3334
claims = []
34-
claims.append(Claim(int(prop), datatype, value))
35+
snak = Snak(int(property_id), datatype, value)
36+
if typ == 'claim':
37+
current_claim = Claim(snak)
38+
claims.append(current_claim)
39+
elif typ == 'reference':
40+
current_claim.references.append(snak)
41+
elif typ == 'qualifier':
42+
current_claim.qualifiers.append(snak)
43+
else:
44+
print "unknown type: {0}".format(typ)
3545

3646
if not current_title is None:
3747
yield Entity(current_title, claims)
3848

39-
return

0 commit comments

Comments
 (0)