Skip to content

Commit 6a1eddf

Browse files
committed
Add scoring rules configuration and policy analysis functionality
- Introduced a new TOML configuration file for scoring rules, including weights for transparency, user control, security, and data sharing, as well as thresholds for grading.
- Implemented a PolicyAnalysis module in `policy_analysis.py` to handle document extraction from PDF and HTML sources, utilizing BeautifulSoup for HTML parsing and pymupdf4llm for PDF processing.
- Created classes for managing policy documents and their analysis results, including methods for extracting text and generating graph image paths.
1 parent 5db9ce3 commit 6a1eddf

File tree

6 files changed

+1186
-616
lines changed

6 files changed

+1186
-616
lines changed
Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,39 @@
1+
[scoring]
2+
transparency_weight = 40
3+
user_control_weight = 20
4+
security_weight = 20
5+
data_sharing_weight = 20
6+
7+
[thresholds]
8+
grade_a = 90
9+
grade_b = 75
10+
grade_c = 50
11+
grade_d = 25
12+
grade_f = 0
13+
14+
[keywords]
15+
user_control = [
16+
"opt-out",
17+
"opt-in",
18+
"privacy settings",
19+
"preferences",
20+
"user choice"
21+
]
22+
23+
security = [
24+
"security",
25+
"encryption",
26+
"data protection",
27+
"safeguards",
28+
"access control"
29+
]
30+
31+
purpose = [
32+
"we use",
33+
"we collect",
34+
"purpose of",
35+
"used for",
36+
"collected for",
37+
"we process",
38+
"data is used"
39+
]

poligrapher/analysis/privacy_scorer.py

Lines changed: 29 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -9,49 +9,50 @@ def __init__(self, scoring_rules=None):
99
self.rules = self.load_default_rules()
1010
else:
1111
self.rules = scoring_rules
12-
12+
1313
# Load detailed scoring criteria
1414
self.criteria = self.load_scoring_criteria()
15-
15+
1616
def load_default_rules(self) -> Dict:
1717
"""Load default scoring rules from config file"""
18-
rules_path = Path("privacy/config/scoring_rules.toml")
18+
rules_path = Path("poligrapher/analysis/config/scoring_rules.toml")
1919
if not rules_path.exists():
2020
raise FileNotFoundError(f"Scoring rules not found at: {rules_path}")
2121
return toml.load(rules_path)
22-
22+
2323
def load_scoring_criteria(self) -> Dict:
2424
"""Load detailed scoring criteria"""
25-
criteria_path = Path("privacy_scorer/criteria/scoring_criteria.toml")
25+
criteria_path = Path("poligrapher/analysis/criteria/scoring_criteria.toml")
2626
if not criteria_path.exists():
2727
raise FileNotFoundError(f"Scoring criteria not found at: {criteria_path}")
2828
return toml.load(criteria_path)
29-
29+
3030
def score_policy(self, policy_text: str) -> Dict[str, Any]:
3131
"""Score the entire privacy policy"""
3232
if not policy_text:
3333
return self._create_error_result("Empty policy text")
34-
34+
3535
try:
3636
total_score = 0.0
3737
category_scores = {}
38-
38+
3939
# Score each category
4040
for category in self.criteria['categories'].keys():
4141
result = self._score_category(category, policy_text)
4242
category_scores[category] = result
4343
total_score += result['weighted_score']
44-
44+
4545
grade = self._calculate_grade(total_score)
46-
46+
4747
return {
4848
"total_score": round(total_score, 1),
49+
"success": True,
4950
"grade": grade,
5051
"category_scores": category_scores,
5152
"feedback": [score["feedback"] for score in category_scores.values()],
52-
"summary": self._generate_summary(total_score, grade)
53+
"summary": self._generate_summary(total_score, grade),
5354
}
54-
55+
5556
except Exception as e:
5657
return self._create_error_result(f"Error scoring policy: {str(e)}")
5758

@@ -70,7 +71,7 @@ def _parse_item(self, text: str) -> tuple[str, Any]:
7071
parts = text.split(':', 1)
7172
if len(parts) < 2:
7273
return None # Return None if we can't parse properly
73-
74+
7475
key = parts[0].strip()
7576
value = parts[1].strip()
7677
return (key, value)
@@ -81,33 +82,33 @@ def _score_category(self, category: str, policy_text: str) -> Dict[str, Any]:
8182
"""Score a specific category of the privacy policy"""
8283
raw_score = 0.0
8384
feedback = []
84-
85+
8586
try:
8687
# Get the criteria for this category
8788
category_criteria = self.criteria['categories'].get(category, [])
88-
89+
8990
if not category_criteria:
9091
return {
9192
"raw_score": 0.0,
9293
"weighted_score": 0.0,
9394
"feedback": [f"No criteria defined for category: {category}"]
9495
}
95-
96+
9697
for criterion in category_criteria:
9798
criterion_data = self.criteria['criteria'].get(criterion, {})
9899
if not criterion_data:
99100
continue
100-
101+
101102
# Get phrases
102103
required_phrases = criterion_data.get('required_phrases', [])
103104
matching_phrases = criterion_data.get('matching_phrases', [])
104-
105+
105106
# Check phrases
106107
required_found, found_required = self._check_phrases(policy_text, required_phrases)
107108
matching_found, found_matching = self._check_phrases(policy_text, matching_phrases)
108-
109+
109110
points = float(criterion_data.get('points', 0))
110-
111+
111112
if required_found:
112113
if matching_found:
113114
raw_score += points
@@ -128,7 +129,7 @@ def _score_category(self, category: str, policy_text: str) -> Dict[str, Any]:
128129
"weighted_score": round(weighted_score, 2),
129130
"feedback": feedback
130131
}
131-
132+
132133
except Exception as e:
133134
return {
134135
"raw_score": 0.0,
@@ -152,11 +153,11 @@ def _calculate_grade(self, score: float) -> str:
152153
def _generate_detailed_feedback(self, category_scores: Dict[str, Any]) -> List[str]:
153154
"""Generate detailed feedback based on category scores and findings"""
154155
detailed_feedback = []
155-
156+
156157
for category, scores in category_scores.items():
157158
score = scores['raw_score']
158159
category_name = category.replace('_', ' ').title()
159-
160+
160161
if score < 50:
161162
detailed_feedback.append(
162163
f"Critical: {category_name} needs significant improvement. "
@@ -193,14 +194,16 @@ def _create_error_result(self, error_message: str) -> Dict:
193194
"""Create an error result dictionary"""
194195
return {
195196
"total_score": 0.0,
197+
"success": False,
196198
"grade": "F",
197199
"category_scores": {
198200
category: {
199201
"raw_score": 0.0,
200202
"weighted_score": 0.0,
201-
"feedback": [error_message]
202-
} for category in self.criteria['categories'].keys()
203+
"feedback": [error_message],
204+
}
205+
for category in self.criteria["categories"].keys()
203206
},
204207
"feedback": [error_message],
205-
"summary": "Unable to evaluate privacy policy"
208+
"summary": "Unable to evaluate privacy policy",
206209
}

0 commit comments

Comments
 (0)