This repository was archived by the owner on Apr 13, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_PositionDependentCharsetCheck.py
More file actions
87 lines (64 loc) · 3.12 KB
/
_PositionDependentCharsetCheck.py
File metadata and controls
87 lines (64 loc) · 3.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import SupervisedValidator
import _GeneralCharsetCheck
class DefaultPositionMeasure:
    """Position measure that reports a character's position unchanged.

    ``name`` identifies this measure; ``Classifier`` uses it as the key of
    the per-measure dictionary it builds.
    """

    name = "from_start"

    def __init__(self):
        """Create the measure; it carries no per-instance state."""

    def extract(self, position_from_start, word_length):
        """Return ``position_from_start`` as-is.

        ``word_length`` is accepted only so that every measure shares the
        same call signature; it is not used here.
        """
        return position_from_start
class ReversePositionMeasure:
    """Position measure that counts a character's position from the end.

    ``name`` identifies this measure; ``Classifier`` uses it as the key of
    the per-measure dictionary it builds.
    """

    name = "from_end"

    def __init__(self):
        """Create the measure; it carries no per-instance state."""

    def extract(self, position_from_start, word_length):
        """Return how many characters follow the given position.

        For a zero-based ``position_from_start`` the last character of a
        word of ``word_length`` characters maps to 0, the one before it to
        1, and so on.
        """
        one_based_position = position_from_start + 1
        return word_length - one_based_position
class RelativePositionMeasure:
    """Position measure that expresses a position as a fraction of the word.

    ``name`` identifies this measure; ``Classifier`` uses it as the key of
    the per-measure dictionary it builds.
    """

    name = "relative"

    def __init__(self):
        """Create the measure; it carries no per-instance state."""

    def extract(self, position_from_start, word_length):
        """Return ``(position_from_start + 1) / word_length`` as a float.

        Both operands are converted to ``float`` before dividing, so the
        result is a true quotient even under integer-division semantics.
        For a zero-based position inside the word the last character maps
        to 1.0.
        """
        numerator = float(position_from_start + 1)
        denominator = float(word_length)
        return numerator / denominator
class Classifier:
    """Position-dependent charset check over the sub-phrases of a phrase.

    On construction the classifier scans every training sub-phrase exposed
    by ``validator.valid_sub_phrases`` and records, per sub-phrase index and
    per position measure, which characters were seen at each measured
    position. ``classify`` then rejects a phrase as soon as one of its
    characters was never recorded at a position that has training data.

    ``position_maps[i][measure_name][position]`` is a dict with two keys:
    ``"chars"`` (the accepted characters) and ``"data_amount"`` (how many
    training characters were observed at that position).
    """

    validator = None  # type: SupervisedValidator.Validator
    # One instance of every measure used to turn a character index into a key.
    known_position_measures = [DefaultPositionMeasure(), ReversePositionMeasure(), RelativePositionMeasure()]
    # Class-level placeholder only; __init__ binds a fresh list per instance.
    position_maps = []
    general_check = None  # type: _GeneralCharsetCheck.Classifier

    def __init__(self, validator, general_check):
        """Store the collaborators and train the position maps.

        :param validator: object providing ``split_into_sub_phrases(phrase)``
            and ``valid_sub_phrases`` (the training data, one list of
            sub-phrases per sub-phrase index).
        :param general_check: object providing ``sub_phrase_data[i].full_charset``
            for every sub-phrase index ``i``.
        """
        self.validator = validator
        self.general_check = general_check
        # A per-instance list is required: appending to the class-level list
        # would make every Classifier share (and corrupt) the same maps.
        self.position_maps = []
        self.__train()

    def classify(self, phrase):
        """Return whether ``phrase`` is consistent with the trained positions.

        :param phrase: value handed to ``validator.split_into_sub_phrases``.
        :return: ``False`` when the phrase yields no sub-phrases or when any
            character is absent from the set recorded for its position under
            any measure; ``True`` otherwise. Positions without training data
            are skipped rather than rejected.
        """
        sub_phrases = self.validator.split_into_sub_phrases(phrase)
        # Compare by value; identity comparison with an int literal is not
        # guaranteed to work and warns on modern interpreters.
        if len(sub_phrases) == 0:
            return False
        for i, sub_phrase in enumerate(sub_phrases):
            length = len(sub_phrase)
            for measure in self.known_position_measures:
                for pos, char in enumerate(sub_phrase):
                    measure_map = self.position_maps[i][measure.name]
                    measurement = measure.extract(pos, length)
                    if measurement not in measure_map:
                        # No training data for this position: no constraint.
                        continue
                    if char not in measure_map[measurement]["chars"]:
                        # false by position rules
                        return False
        return True

    def __train(self):
        """Populate ``self.position_maps`` from the validator's training data."""
        measures = self.known_position_measures

        # Pass 1: collect the characters observed at every measured position.
        for i, sub_phrase_list in enumerate(self.validator.valid_sub_phrases):
            self.position_maps.append({m.name: {} for m in measures})
            for sub_phrase in sub_phrase_list:
                length = len(sub_phrase)
                for p, char in enumerate(sub_phrase):
                    for measure in measures:
                        measure_dict = self.position_maps[i][measure.name]
                        index = measure.extract(p, length)
                        if index in measure_dict:
                            measure_dict[index]["chars"].add(char)
                            measure_dict[index]["data_amount"] += 1
                        else:
                            # A one-element set holding the element itself;
                            # set(char) would iterate over the element.
                            measure_dict[index] = {"chars": {char}, "data_amount": 1}

        # Pass 2: where the coverage figures fall below the thresholds, fall
        # back to the sub-phrase's full charset for that position.
        # NOTE(review): presumably p and pc express how trustworthy the
        # observed set is given the sample size - confirm against
        # SupervisedValidator.coverage_probability.
        for i, position_map in enumerate(self.position_maps):
            for items in position_map.values():
                for info in items.values():
                    full_charset = self.general_check.sub_phrase_data[i].full_charset
                    p, pc = SupervisedValidator.coverage_probability(
                        float(len(full_charset)),
                        float(len(info["chars"])),
                        float(info["data_amount"]))
                    if p < 0.01 or pc < 0.9:
                        info["chars"] = full_charset