|
1 | 1 | from collections import defaultdict |
2 | | -import math |
3 | 2 | from propertysuggester.analyzer.impl.Analyzer import Analyzer |
4 | 3 | from propertysuggester.analyzer.rule import Rule |
5 | 4 |
|
6 | | - |
class ItemAnalyzer(Analyzer):
    """Count property co-occurrences on items and turn them into rules.

    Two kinds of "context" are counted, each stored as a ``(pid, value)`` key:

    * ``(pid, None)`` for every property that is not a classifying property;
    * ``(pid, numeric_id)`` for every claim of a classifying property whose
      value looks like one prefix character followed by digits (the prefix
      itself is not checked).

    ``tuple_occurrences[context]`` is the number of processed items showing
    the context; ``pair_occurrences[context][pid2]`` is the number of those
    items that also carry the (different) property ``pid2``.
    """

    # Used when the caller passes no ids.
    # NOTE(review): presumably Wikidata "instance of" (P31) and
    # "subclass of" (P279) -- confirm.
    DEFAULT_CLASSIFYING_PROPERTY_IDS = (31, 279)

    def __init__(self, classiying_property_ids=None):
        """Create an analyzer with empty counters.

        :param classiying_property_ids: iterable of property ids that are
            counted per (property, value) instead of per property. ``None``
            selects ``DEFAULT_CLASSIFYING_PROPERTY_IDS``. (The misspelled
            parameter name is kept so existing keyword callers still work.)
        """
        Analyzer.__init__(self)
        if classiying_property_ids is None:
            classiying_property_ids = self.DEFAULT_CLASSIFYING_PROPERTY_IDS
        # Snapshot into an immutable set: no list is shared between instances
        # (or with the caller), and membership tests are O(1).
        self.classiying_pids = frozenset(classiying_property_ids)
        self.tuple_occurrences = defaultdict(int)
        self.pair_occurrences = defaultdict(lambda: defaultdict(int))

    def process(self, item):
        """Update the counters with the claims of one item.

        :param item: object whose ``claims`` yields claims exposing
            ``mainsnak.property_id`` and ``mainsnak.value``.
        """
        # Walk the claims once so a one-shot iterable is handled correctly.
        mainsnaks = [claim.mainsnak for claim in item.claims]
        distinct_ids = set(snak.property_id for snak in mainsnaks)
        property_value_pairs = [(snak.property_id, snak.value)
                                for snak in mainsnaks]
        self._count_occurrences(distinct_ids, property_value_pairs)

    def _count_occurrences(self, distinct_ids, property_value_pairs):
        """Count every context of a single item exactly once.

        :param distinct_ids: set of all property ids present on the item.
        :param property_value_pairs: ``(property_id, value)`` per claim.
        """
        for pid1 in distinct_ids:
            if pid1 in self.classiying_pids:
                continue
            self._count_context((pid1, None), distinct_ids)

        # De-duplicate classifying contexts so that a repeated claim does not
        # count the same item twice (mirrors the use of distinct_ids above).
        classifying_contexts = set()
        for pid1, value in property_value_pairs:
            if pid1 not in self.classiying_pids:
                continue
            numeric_id = self._numeric_id(value)
            if numeric_id is not None:
                classifying_contexts.add((pid1, numeric_id))

        for context in classifying_contexts:
            self._count_context(context, distinct_ids)

    def _count_context(self, context, distinct_ids):
        """Register one item for ``context`` and for each co-occurring pid."""
        self.tuple_occurrences[context] += 1
        context_pid = context[0]
        for pid2 in distinct_ids:
            if pid2 != context_pid:
                self.pair_occurrences[context][pid2] += 1

    @staticmethod
    def _numeric_id(value):
        """Return the integer after the first character of ``value``.

        Returns ``None`` when the remainder is not purely digits or when
        ``value`` is not string-like (for example ``None``), instead of
        raising.
        """
        try:
            digits = value[1:]
            is_numeric = digits.isdigit()
        except (TypeError, AttributeError):
            return None
        return int(digits) if is_numeric else None

    def get_rules(self):
        """Build one ``Rule`` per counted (context, property) pair.

        The probability of a rule is the pair count divided by the number of
        items that showed the context.

        :return: list of ``Rule`` objects created as
            ``Rule(pid1, value, pid2, paircount, probability, "item")``.
        """
        rules = []
        # items() instead of iteritems(): works on both Python 2 and 3.
        for (pid1, value), row in self.pair_occurrences.items():
            pid1count = self.tuple_occurrences[pid1, value]
            for pid2, paircount in row.items():
                if paircount > 0:
                    probability = paircount / float(pid1count)
                    rules.append(Rule(pid1, value, pid2, paircount,
                                      probability, "item"))
        return rules
0 commit comments