This repository was archived by the owner on Apr 13, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_PredecessorDependentCharsetCheck.py
More file actions
72 lines (62 loc) · 2.87 KB
/
_PredecessorDependentCharsetCheck.py
File metadata and controls
72 lines (62 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import _GeneralCharsetCheck
import SupervisedValidator
class Classifier:
    """Check characters against the predecessors they followed in training.

    During training, every substring that immediately precedes a character
    in a valid sub-phrase is recorded together with that character, once
    keyed by the substring alone and once keyed by the substring plus its
    start position.  Rules that survive the coverage check are then used by
    ``classify`` to reject phrases containing a character that was never
    observed after a known predecessor.
    """

    validator = None  # type: SupervisedValidator.Validator
    general_check = None  # type: _GeneralCharsetCheck.Classifier

    def __init__(self, validator, general_check):
        """Store the collaborators and train the rule maps.

        :param validator: object providing ``valid_sub_phrases``,
            ``split_into_sub_phrases`` and ``reserved_delimiter_sequence``.
        :param general_check: object providing ``sub_phrase_data[i].full_charset``
            for each sub-phrase index ``i``.
        """
        self.validator = validator
        self.general_check = general_check
        # These lists must be created per instance: as class attributes they
        # would be shared, so every new Classifier would append to (and read
        # from) the maps trained by earlier instances.
        self.sub_phrase_function_maps = []
        self.sub_phrase_position_function_maps = []
        self.__train()

    def classify(self, phrase):
        """Return True when no trained predecessor rule rejects ``phrase``.

        :param phrase: value passed to ``validator.split_into_sub_phrases``.
        :return: ``False`` if the phrase yields no sub-phrases or if any
            character is absent from the allowed set of a known predecessor
            key; ``True`` otherwise.
        :raises IndexError: if the phrase yields more sub-phrases than there
            are trained maps.
        """
        sub_phrases = self.validator.split_into_sub_phrases(phrase)
        if not sub_phrases:
            return False
        for i, sub_phrase in enumerate(sub_phrases):
            pred_map = self.sub_phrase_function_maps[i]
            pred_pos_map = self.sub_phrase_position_function_maps[i]
            for pos, char in enumerate(sub_phrase):
                for key, pos_key in self._predecessor_keys(sub_phrase, pos):
                    # A key missing from a map means "no rule", not a failure.
                    if key in pred_map and char not in pred_map[key]:
                        return False
                    if pos_key in pred_pos_map and char not in pred_pos_map[pos_key]:
                        return False
        return True

    def _predecessor_keys(self, sub_phrase, pos):
        """Yield ``(key, positional_key)`` for every predecessor of ``pos``.

        ``key`` is the substring ``sub_phrase[start:pos]`` for each ``start``
        from ``pos - 1`` down to ``0``; ``positional_key`` is that substring
        followed by the validator's reserved delimiter and ``start``.
        Nothing is yielded for ``pos == 0``.
        """
        for start in range(pos - 1, -1, -1):
            key = sub_phrase[start:pos]
            yield key, key + self.validator.reserved_delimiter_sequence + str(start)

    def __train(self):
        """Build one plain and one positional rule map per sub-phrase index."""
        for i, sub_phrase_list in enumerate(self.validator.valid_sub_phrases):
            pred_map = {}
            pred_pos_map = {}
            for sub_phrase in sub_phrase_list:
                for pos, char in enumerate(sub_phrase):
                    for key, pos_key in self._predecessor_keys(sub_phrase, pos):
                        # Keep duplicates: the coverage check needs the total
                        # number of observations as well as the distinct ones.
                        pred_map.setdefault(key, []).append(char)
                        pred_pos_map.setdefault(pos_key, []).append(char)
            self._check_coverage_for_map(pred_map, i)
            self.sub_phrase_function_maps.append(pred_map)
            self._check_coverage_for_map(pred_pos_map, i)
            self.sub_phrase_position_function_maps.append(pred_pos_map)

    def _check_coverage_for_map(self, rule_map, i):
        """Prune ``rule_map`` in place and turn surviving values into sets.

        For each rule, ``SupervisedValidator.coverage_probability`` is called
        with the size of the full charset for sub-phrase ``i``, the number of
        distinct observed characters and the total number of observations.
        The rule is deleted when the first returned value is below 0.01 or
        the second is below 0.9; otherwise its list of observations is
        replaced by the set of distinct characters.

        :param rule_map: dict mapping a predecessor key to a list of
            observed characters; modified in place.
        :param i: index of the sub-phrase the map belongs to.
        """
        # Snapshot the keys because entries are deleted while looping.
        for rule in list(rule_map):
            observed = rule_map[rule]
            distinct = set(observed)
            p, pc = SupervisedValidator.coverage_probability(
                float(len(self.general_check.sub_phrase_data[i].full_charset)),
                float(len(distinct)),
                float(len(observed)))
            if p < 0.01 or pc < 0.9:
                del rule_map[rule]
            else:
                rule_map[rule] = distinct