Skip to content

Commit 2569586

Browse files
committed
refactor analyzer
1 parent ede0dec commit 2569586

File tree

13 files changed

+211
-166
lines changed

13 files changed

+211
-166
lines changed

analyzer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import sys
33
import time
44

5-
from propertysuggester.analyzer import CsvGenerator, TableGenerator
5+
from propertysuggester.analyzer import CsvGenerator, RuleGenerator
66
from propertysuggester.parser import CsvReader
77
from propertysuggester.utils.CompressedFileType import CompressedFileType
88

@@ -16,8 +16,8 @@
1616

1717
start = time.time()
1818
print "computing table"
19-
t, q, r = TableGenerator.compute_table(CsvReader.read_csv(args.input))
19+
rules = RuleGenerator.compute_rules(CsvReader.read_csv(args.input))
2020
print "writing csv"
21-
CsvGenerator.create_pair_csv(t, q, r, args.output)
21+
CsvGenerator.create_pair_csv(rules, args.output)
2222
print "done - {0:.2f}s".format(time.time()-start)
2323
print "now import this csv file with PropertySuggester/maintenance/UpdateTable.php"

propertysuggester/analyzer/CsvGenerator.py

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,19 @@
11
import csv
2+
from propertysuggester.analyzer.rule import Rule
23

34

4-
def create_pair_csv(table, table_qualifier, table_references, out, delimiter=","):
5+
def create_pair_csv(rules, out, delimiter=","):
56
"""
6-
@type table: dict[int, dict]
7+
@type rules: list[Rule]
78
@type out: file or StringIO.StringIO
89
@type delimiter: string
910
"""
1011
csv_writer = csv.writer(out, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
11-
print "properties: {0}".format(len(table))
1212

1313
csv_writer.writerow(("pid1", "qid1", "pid2", "count", "probability", "context"))
14-
15-
_write_entries(table, csv_writer, "item")
16-
_write_entries(table_qualifier, csv_writer, "qualifier")
17-
_write_entries(table_references, csv_writer, "reference")
18-
19-
20-
def _write_entries(table, csv_writer, context):
21-
print "Writing entries with context " + context
2214
rowcount = 0
23-
for pid1, row in table.iteritems():
24-
for pid2, value in row.iteritems():
25-
if pid1 != pid2 and isinstance(pid2, int) and value > 0: # "appearances" is in the same table, ignore them
26-
probability = value/float(row["appearances"])
27-
csv_writer.writerow((pid1, '', pid2, value, probability, context))
28-
rowcount += 1
29-
if rowcount % 1000 == 0:
30-
print "rows {0}".format(rowcount)
15+
for rule in rules:
16+
csv_writer.writerow((rule.pid1, rule.qid1 or '', rule.pid2, rule.count, rule.probability, rule.context))
17+
rowcount += 1
18+
if rowcount % 1000 == 0:
19+
print "rows {0}".format(rowcount)

propertysuggester/analyzer/RuleGenerator.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
from collections import defaultdict
2+
import itertools
3+
from propertysuggester.analyzer.rule import Rule
4+
from propertysuggester.utils.datamodel import Entity
5+
6+
7+
def compute_rules(entities):
    """
    Run every entity through the item, qualifier and reference analyzers
    and collect the rules they produce.

    @type entities: collections.Iterable[Entity]
    @return: list[Rule]
    """
    analyzers = [ItemAnalyzer(), QualifierAnalyzer(), ReferenceAnalyzer()]

    for i, entity in enumerate(entities):
        # Progress output every 100000 entities, skipping the very first one.
        if i % 100000 == 0 and i > 0:
            print("entities {0}".format(i))
        for analyzer in analyzers:
            analyzer.process(entity)

    # Return a real list, as documented, instead of a one-shot chain
    # iterator: callers can take len() of it or iterate it more than once.
    # Each get_rules() call already builds a list, so nothing lazy is lost.
    return list(itertools.chain.from_iterable(a.get_rules() for a in analyzers))
23+
24+
25+
class Analyzer:
    """
    Base class for the analyzers: holds the occurrence counters and turns
    them into rules. Subclasses fill the counters in process().
    """

    def __init__(self, context):
        """
        @type context: string
        """
        # property id -> number of times the property was counted
        self.propertyOccurances = defaultdict(int)
        # property id -> (other property id -> number of joint occurrences)
        self.coOccurances = defaultdict(lambda: defaultdict(int))
        # label copied into every rule produced by get_rules()
        self.context = context

    def process(self, entity):
        """
        Update the counters with one entity. Must be overridden.

        @type entity: Entity
        @raise NotImplementedError: always, in this base class
        """
        # NotImplementedError is the exception; NotImplemented is a constant
        # for rich comparisons and calling it would raise a TypeError.
        raise NotImplementedError("Please implement this method")

    def get_rules(self):
        """
        Build one rule per (pid1, pid2) pair with a positive co-occurrence
        count. The probability is that count divided by the occurrence
        count of pid1.

        @return: list[Rule]
        """
        rules = []
        for pid1, row in self.coOccurances.items():
            pid1count = self.propertyOccurances[pid1]
            for pid2, value in row.items():
                if value > 0:
                    probability = value/float(pid1count)
                    # NOTE(review): the rule's count field receives pid1count
                    # (occurrences of pid1), not the pair count `value`.
                    # Confirm this is intended before changing it.
                    rules.append(Rule(pid1, None, pid2, pid1count, probability, self.context))
        return rules
52+
53+
54+
class ItemAnalyzer(Analyzer):
    """
    Counts which main-snak properties appear together on the same entity.
    Rules are produced with the context "item".
    """

    def __init__(self):
        Analyzer.__init__(self, "item")

    def process(self, entity):
        """
        @type entity: Entity
        """
        # A property used by several claims of the entity counts only once.
        property_ids = set()
        for claim in entity.claims:
            property_ids.add(claim.mainsnak.property_id)
        self._count_occurances(property_ids)

    def _count_occurances(self, distinct_ids):
        """
        Count each id once and each ordered pair of different ids once.

        @type distinct_ids: set[int]
        """
        for pid in distinct_ids:
            self.propertyOccurances[pid] += 1
        # permutations(…, 2) yields every ordered pair of two different
        # members of the set.
        for first, second in itertools.permutations(distinct_ids, 2):
            self.coOccurances[first][second] += 1
68+
69+
70+
class QualifierAnalyzer(Analyzer):
    """
    Counts, per claim, which qualifier properties appear together with the
    claim's main-snak property. Rules are produced with the context
    "qualifier".
    """

    def __init__(self):
        Analyzer.__init__(self, "qualifier")

    def process(self, entity):
        """
        @type entity: Entity
        """
        for claim in entity.claims:
            snak_pids = set(snak.property_id for snak in self.get_special(claim))
            if not snak_pids:
                # Claims without any such snak are not counted at all.
                continue
            main_pid = claim.mainsnak.property_id
            self.propertyOccurances[main_pid] += 1
            self._count_special_appearances(main_pid, snak_pids)

    def _count_special_appearances(self, mainsnak_id, distinct_ids):
        """
        Record one joint occurrence of mainsnak_id with every id given.
        """
        row = self.coOccurances[mainsnak_id]
        for pid in distinct_ids:
            row[pid] += 1

    def get_special(self, claim):
        """
        Return the snaks of the claim that this analyzer looks at.
        Subclasses override this to analyze a different snak list.
        """
        return claim.qualifiers
87+
88+
89+
class ReferenceAnalyzer(QualifierAnalyzer):
    """
    Same counting as QualifierAnalyzer, but over the claim's references.
    Rules are produced with the context "reference".
    """

    def __init__(self):
        # Calls the base initializer directly so the context is "reference"
        # rather than the "qualifier" set by QualifierAnalyzer.__init__.
        Analyzer.__init__(self, "reference")

    def get_special(self, claim):
        """
        Return the reference snaks of the claim.
        """
        return claim.references

propertysuggester/analyzer/TableGenerator.py

Lines changed: 0 additions & 63 deletions
This file was deleted.

propertysuggester/analyzer/rule.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
2+
class Rule:
    """
    Plain value object for one suggestion rule. Two rules are equal when
    all of their fields are equal.
    """

    def __init__(self, pid1, qid1, pid2, count, probability, context):
        """
        @type pid1: int
        @type qid1: int|None
        @type pid2: int
        @type count: int
        @type probability: float
        @type context: string
        """
        self.pid1 = pid1
        self.qid1 = qid1
        self.pid2 = pid2
        self.count = count
        self.probability = probability
        self.context = context

    def __eq__(self, other):
        return isinstance(other, Rule) and self.__dict__ == other.__dict__

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__; without this method two
        # equal rules would still compare as "not equal".
        return not self.__eq__(other)

    def __str__(self):
        return str(self.__dict__)

    def __repr__(self):
        # Shown for rules inside lists, e.g. in test failure messages.
        return "Rule({0!r}, {1!r}, {2!r}, {3!r}, {4!r}, {5!r})".format(
            self.pid1, self.qid1, self.pid2, self.count, self.probability, self.context)

propertysuggester/parser/CsvReader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def read_csv(input_file, delimiter=","):
2828
print "error: {0}".format(row)
2929
title, typ, property_id, datatype, value = row
3030
if current_title != title:
31-
if not current_title is None:
31+
if current_title is not None:
3232
yield Entity(current_title, claims)
3333
current_title = title
3434
claims = []
@@ -39,7 +39,7 @@ def read_csv(input_file, delimiter=","):
3939
elif typ == 'reference':
4040
current_claim.references.append(snak)
4141
elif typ == 'qualifier':
42-
current_claim.qualifier.append(snak)
42+
current_claim.qualifiers.append(snak)
4343
else:
4444
print "unknown type: {0}".format(typ)
4545

propertysuggester/parser/CsvWriter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def write_csv(entities, output_file, delimiter=","):
1515
for claim in entity.claims:
1616
title = entity.title.encode("utf-8")
1717
write_row(csv_writer, title, "claim", claim.mainsnak)
18-
for q in claim.qualifier:
18+
for q in claim.qualifiers:
1919
write_row(csv_writer, title, "qualifier", q)
2020
for ref in claim.references:
2121
write_row(csv_writer, title, "reference", ref)

propertysuggester/test/analyzer/test_csv_generator.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from testtools.matchers import Equals
66

77
from propertysuggester.analyzer import CsvGenerator
8+
from propertysuggester.analyzer.rule import Rule
89

910

1011
class CsvGeneratorTest(TestCase):
@@ -13,13 +14,12 @@ def setUp(self):
1314
self.file = StringIO()
1415

1516
def test_create_table(self):
16-
table = {1: {'appearances': 8, 'type': 'string', 2: 5, 3: 0}}
17-
CsvGenerator.create_pair_csv(table, {}, {}, self.file)
17+
rule = Rule(1, None, 2, 5, 0.3, "item")
18+
CsvGenerator.create_pair_csv([rule], self.file)
1819

1920
self.file.seek(0)
2021
self.assertThat(self.file.readline().strip(), Equals("pid1,qid1,pid2,count,probability,context"))
21-
prob = 5.0 / 8.0
22-
self.assertThat(self.file.readline().strip(), Equals("1,,2,5,{0},item".format(prob)))
22+
self.assertThat(self.file.readline().strip(), Equals("1,,2,5,0.3,item"))
2323

2424

2525
if __name__ == '__main__':
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import unittest
2+
3+
from testtools import TestCase
4+
from testtools.matchers import *
5+
6+
from propertysuggester.analyzer import RuleGenerator
7+
from propertysuggester.analyzer.rule import Rule
8+
from propertysuggester.utils.datamodel import Entity, Claim, Snak
9+
10+
11+
test_data1 = [Entity('Q15', [Claim(Snak(31, 'wikibase-entityid', 'Q5107')),
12+
Claim(Snak(373, 'string', 'Africa'))]),
13+
Entity('Q16', [Claim(Snak(31, 'wikibase-entityid', 'Q384'))])]
14+
15+
test_data2 = [Entity('Q15', [Claim(Snak(31, 'wikibase-entityid', 'Q5107')),
16+
Claim(Snak(373, 'string', 'Africa')),
17+
Claim(Snak(373, 'string', 'Europe'))])]
18+
19+
test_data3 = [Entity('Q15', [Claim(Snak(31, 'wikibase-entityid', 'Q5107'),
20+
[Snak(12, 'wikibase-entityid', 'Q123'), Snak(13, 'string', 'qual')],
21+
[Snak(22, 'wikibase-entityid', 'Q345'), Snak(23, 'string', 'rel')])])]
22+
23+
24+
class RuleGeneratorTest(TestCase):
    """
    Checks the rules RuleGenerator.compute_rules derives from the
    module-level test entities.
    """

    def setUp(self):
        TestCase.setUp(self)

    def test_table_generator(self):
        # Property 31 is on both entities, 373 only on the first one.
        expected = [Rule(31, None, 373, 2, 0.5, "item"),
                    Rule(373, None, 31, 1, 1.0, "item")]
        rules = list(RuleGenerator.compute_rules(test_data1))
        self.assertThat(rules, ContainsAll(expected))

    def test_table_with_multiple_occurance(self):
        # Two claims with property 373 on one entity must count only once.
        expected = [Rule(31, None, 373, 1, 1.0, "item"),
                    Rule(373, None, 31, 1, 1.0, "item")]
        rules = list(RuleGenerator.compute_rules(test_data2))
        self.assertThat(rules, ContainsAll(expected))

    def test_table_with_qualifier_and_references(self):
        expected = [Rule(31, None, 12, 1, 1.0, "qualifier"),
                    Rule(31, None, 13, 1, 1.0, "qualifier"),
                    Rule(31, None, 22, 1, 1.0, "reference"),
                    Rule(31, None, 23, 1, 1.0, "reference")]
        rules = list(RuleGenerator.compute_rules(test_data3))
        self.assertThat(rules, ContainsAll(expected))
43+
44+
45+
if __name__ == '__main__':
46+
unittest.main()

0 commit comments

Comments
 (0)