-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlabel_parser.py
More file actions
67 lines (53 loc) · 2.31 KB
/
label_parser.py
File metadata and controls
67 lines (53 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""
Project: Images Dataset Collector
File: label_parser.py
Description: Utilities for parsing the input keywords JSON file. Extracts labels, categories,
and thresholds for the collection process.
"""
import json
from pathlib import Path
from typing import Optional, List, Dict, Any
from config import KEYWORDS_JSON
def get_all_labels(json_path: Optional[Path] = None) -> List[Dict[str, Any]]:
    """
    Flatten every keyword in the keywords JSON file into a list of label records.

    The file is read as a JSON object with an optional top-level
    'default_threshold' and a 'categories' list; each category may carry a
    'name' and a 'keywords' list whose entries may carry 'text' and 'threshold'.

    Args:
        json_path: Location of the keywords JSON file. When None, the
            module-level KEYWORDS_JSON path is used.

    Returns:
        One dictionary per keyword, in file order, with the keys 'text',
        'category' and 'threshold'. Missing values fall back to "" for the
        text, "unknown" for the category name, and the file's
        'default_threshold' (0.5 if absent) for the threshold. If the file
        does not exist, an error is printed and an empty list is returned.
    """
    json_path = KEYWORDS_JSON if json_path is None else json_path

    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except FileNotFoundError:
        # Best-effort: report the missing file and hand back nothing.
        print(f"Error: Keywords file not found at {json_path}")
        return []

    fallback_threshold = payload.get("default_threshold", 0.5)

    flattened: List[Dict[str, Any]] = []
    for group in payload.get("categories", []):
        group_name = group.get("name", "unknown")
        # Each keyword becomes one flat record tagged with its category.
        flattened.extend(
            {
                "text": entry.get("text", ""),
                "category": group_name,
                "threshold": entry.get("threshold", fallback_threshold),
            }
            for entry in group.get("keywords", [])
        )
    return flattened
def get_labels_by_category(category_name: str, json_path: Optional[Path] = None) -> List[Dict[str, Any]]:
    """
    Return the labels whose category equals `category_name`, ignoring case.

    Args:
        category_name: Category to select; compared after lower-casing both sides.
        json_path: Keywords JSON file, forwarded unchanged to get_all_labels.

    Returns:
        The matching label dictionaries, in the order get_all_labels yields them.
    """
    selected = []
    for label in get_all_labels(json_path):
        if label["category"].lower() == category_name.lower():
            selected.append(label)
    return selected
def get_category_names(json_path: Optional[Path] = None) -> List[str]:
    """
    Retrieve the unique category names defined in the keywords JSON file.

    Args:
        json_path: Location of the keywords JSON file. When None, the
            module-level KEYWORDS_JSON path is used.

    Returns:
        Category names in first-seen order with duplicates removed. A
        category without a 'name' key contributes an empty string. If the
        file does not exist, an empty list is returned without any message.
    """
    if json_path is None:
        json_path = KEYWORDS_JSON

    # Keep the try body to the file read so only a missing file is swallowed.
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return []

    names = (cat.get("name", "") for cat in data.get("categories", []))
    # dict.fromkeys drops repeats while keeping first-seen order, so the
    # result is actually unique; assumes names are hashable (JSON strings).
    return list(dict.fromkeys(names))