Skip to content

Commit 7133fce

Browse files
committed
Address comments
1 parent 50f1c3e commit 7133fce

File tree

5 files changed

+153
-16
lines changed

5 files changed

+153
-16
lines changed

detectors/built_in/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def detect_content(request: ContentAnalysisHttpRequest):
2727
try:
2828
message_detections += detector_registry.handle_request(content, request.detector_params)
2929
except Exception as e:
30-
raise HTTPException(status_code=500, detail=str(e))
30+
raise HTTPException(status_code=500) from e
3131
detections.append(message_detections)
3232
return ContentsAnalysisResponse(root=detections)
3333

detectors/built_in/file_type_detectors.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,10 @@ def __init__(self):
176176

177177
def handle_request(self, content: str, detector_params: dict) -> List[ContentAnalysisResponse]:
178178
detections = []
179-
if "file_type" in detector_params and isinstance(detector_params["file_type"], list):
180-
for file_type in detector_params["file_type"]:
179+
if "file_type" in detector_params and isinstance(detector_params["file_type"], (list, str)):
180+
file_types = detector_params["file_type"]
181+
file_types = [file_types] if isinstance(file_types, str) else file_types
182+
for file_type in file_types:
181183
if file_type.startswith("json-with-schema"):
182184
result = is_valid_json_schema(content, file_type.split("json-with-schema:")[1])
183185
if result is not None:

detectors/built_in/regex_detectors.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,57 @@ def email_address_detector(string: str) -> List[ContentAnalysisResponse]:
1111
return get_regex_detections(string, pattern, "pii", "email_address")
1212

1313
def credit_card_detector(string: str) -> List[ContentAnalysisResponse]:
    """Detect credit card numbers in the text contents.

    Recognises Visa, MasterCard, Amex, Discover, Diners Club and JCB numbers,
    written either as one continuous run of digits or as digit groups joined
    by a single, consistent separator (space or dash). A candidate is only
    reported when its digits pass the Luhn check.

    Args:
        string: Text to scan.

    Returns:
        One ``ContentAnalysisResponse`` per Luhn-valid match, in order of
        appearance, with ``detection_type="pii"``, ``detection="credit_card"``
        and ``score=1.0``. ``text`` is the match exactly as it appears in
        ``string`` (separators included); ``start``/``end`` are its offsets.
    """
    # Every "with separators" alternative captures its first separator and
    # reuses it through a backreference, so a number that mixes separators
    # (e.g. "4111-1111 1111-1111") is not matched.
    pattern = (
        r"\b(?:"
        r"4\d{3}([- ])\d{4}\1\d{4}\1\d{4}"  # Visa 16-digit with separators
        r"|4\d{15}"  # Visa 16-digit continuous
        r"|5[1-5]\d{2}([- ])\d{4}\2\d{4}\2\d{4}"  # MasterCard 16-digit with separators
        r"|5[1-5]\d{14}"  # MasterCard 16-digit continuous
        r"|3[47]\d{2}([- ])\d{6}\3\d{5}"  # Amex 15-digit with separators
        r"|3[47]\d{13}"  # Amex 15-digit continuous
        r"|6(?:011|5\d{2})([- ])\d{4}\4\d{4}\4\d{4}"  # Discover 16-digit with separators
        r"|6(?:011|5\d{2})\d{12}"  # Discover 16-digit continuous
        r"|3(?:0[0-5]|[68]\d)\d([- ])\d{6}\5\d{4}"  # Diners Club 14-digit with separators
        r"|3(?:0[0-5]|[68]\d)\d{11}"  # Diners Club 14-digit continuous
        r"|35\d{2}([- ])\d{4}\6\d{4}\6\d{4}"  # JCB 16-digit with separators
        r"|35\d{14}"  # JCB 16-digit continuous
        r")\b"
    )
    detections = []
    for match in re.finditer(pattern, string):
        # Strip separators so only the digits reach the Luhn check. The digits
        # are intentionally never printed or logged: they are the sensitive
        # data this detector exists to find.
        cc_number = match.group(0).replace(" ", "").replace("-", "")
        if is_luhn_valid(cc_number):
            detections.append(
                ContentAnalysisResponse(
                    start=match.start(),
                    end=match.end(),
                    text=match.group(0),
                    detection_type="pii",
                    detection="credit_card",
                    score=1.0,
                )
            )
    return detections
49+
50+
def luhn_checksum(card_number: str) -> int:
    """Return the Luhn checksum (0-9) of the digits in ``card_number``.

    Characters other than the ASCII digits 0-9 (spaces, dashes, ...) are
    ignored. A result of 0 means the digit sequence satisfies the Luhn check.

    Args:
        card_number: Card number text, optionally containing separators.

    Returns:
        The Luhn sum modulo 10. An input without any digits yields 0.
    """
    # Keep ASCII digits only; anything else is treated as a separator.
    digits = [int(c) for c in card_number if c in "0123456789"]
    checksum = 0
    # Walk from the rightmost digit: positions 0, 2, 4, ... are added as-is,
    # positions 1, 3, 5, ... are doubled first.
    for position, digit in enumerate(reversed(digits)):
        if position % 2:
            doubled = digit * 2
            # A two-digit double (10..18) contributes the sum of its own
            # digits, which is always doubled - 9.
            checksum += doubled - 9 if doubled > 9 else doubled
        else:
            checksum += digit
    return checksum % 10
62+
63+
def is_luhn_valid(card_number):
    """Return True when ``card_number`` contains digits that pass the Luhn check.

    Args:
        card_number: Card number text; non-digit characters are ignored by the
            checksum.

    Returns:
        True if the Luhn checksum of the digits is 0, otherwise False. Input
        without a single digit is never valid.
    """
    # An input with no digits has a checksum of 0 and would otherwise be
    # reported as a valid card number, so reject it up front.
    if not any(c in "0123456789" for c in card_number):
        return False
    return luhn_checksum(card_number) == 0
1765

1866

1967
def ipv4_detector(string: str) -> List[ContentAnalysisResponse]:
@@ -85,8 +133,10 @@ def __init__(self):
85133

86134
def handle_request(self, content: str, detector_params: dict) -> List[ContentAnalysisResponse]:
87135
detections = []
88-
if "regex" in detector_params and isinstance(detector_params["regex"], list):
89-
for regex in detector_params["regex"]:
136+
if "regex" in detector_params and isinstance(detector_params["regex"], (list, str)):
137+
regexes = detector_params["regex"]
138+
regexes = [regexes] if isinstance(regexes, str) else regexes
139+
for regex in regexes:
90140
if regex == "$CUSTOM_REGEX":
91141
pass
92142
elif regex in self.registry:

tests/detectors/builtIn/test_filetype.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@ def test_json_schema_invalid(self, client: TestClient, jsonschema):
7878
assert detections[0]["detection"] == "json_schema_mismatch"
7979

8080
def test_json_schema_invalid_json(self, client: TestClient, jsonschema):
81-
import json
8281
payload = {
8382
"contents": ['{a: 1}'],
8483
"detector_params": {"file_type": [f"json-with-schema:{jsonschema}"]}
@@ -91,9 +90,7 @@ def test_json_schema_invalid_json(self, client: TestClient, jsonschema):
9190

9291
def test_json_schema_invalid_json_schema(self, client: TestClient):
9392
# The schema string itself is malformed JSON (unterminated object), so it cannot be parsed
94-
invalid_schema = {
95-
"notvalidjson: {"
96-
}
93+
invalid_schema = '{"notvalidjson": {'
9794
payload = {
9895
"contents": [json.dumps({"a": 1})],
9996
"detector_params": {"file_type": [f"json-with-schema:{invalid_schema}"]}
@@ -218,6 +215,17 @@ def test_detect_content_unrecognized_filetype(self, client: TestClient):
218215
assert "message" in data
219216
assert "Unrecognized file type" in data["message"]
220217

218+
def test_detect_content_single_filetype(self, client: TestClient):
    """A bare string ``file_type`` (not wrapped in a list) is accepted."""
    response = client.post(
        "/api/v1/text/contents",
        json={
            "contents": ['{a: 1, b: 2}'],
            "detector_params": {"file_type": "json"},
        },
    )
    assert response.status_code == 200
    # One list of detections per submitted content; inspect the first.
    first_content_detections = response.json()[0]
    assert first_content_detections[0]["detection"] == "invalid_json"
227+
228+
221229
def test_multiple_filetype_valid_and_invalid(self, client: TestClient):
222230
import json
223231
schema = {

tests/detectors/builtIn/test_regex.py

Lines changed: 81 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ def client(self):
1111
"regex,content,expected",
1212
[
1313
("email", "Contact me at [email protected]", "[email protected]"),
14-
("credit-card", "Card: 4111-1111-1111-1111", "4111-1111-1111-1111"),
15-
("credit-card", "Card: 4111 1111 1111 1111", "4111 1111 1111 1111"),
1614
("ipv4", "My IP is 192.168.1.1", "192.168.1.1"),
1715
("us-social-security-number", "SSN: 123-45-6789", "123-45-6789"),
1816
("us-social-security-number", "SSN: 123 45 6789", "123 45 6789"),
@@ -37,8 +35,7 @@ def test_builtin_regex_detectors(self, client, regex, content, expected):
3735
@pytest.mark.parametrize(
3836
"regex,content",
3937
[
40-
("email", "Contact me at [email protected] "),
41-
("credit-card", "Card: 4111111111111111"),
38+
("email", "Contact me at [email protected] "), # invalid luhn card
4239
("us-social-security-number", "SSN: 123456789"),
4340
("us-phone-number", "Call 1234567890"),
4441
]
@@ -52,6 +49,73 @@ def test_builtin_regex_detectors_should_not_match(self, client, regex, content):
5249
assert resp.status_code == 200
5350
assert resp.json()[0] == []
5451

52+
@pytest.mark.parametrize(
    "content,expected",
    [
        # Visa
        ("Card: 4111 1111 1111 1111", "4111 1111 1111 1111"),
        ("Card: 4111-1111-1111-1111", "4111-1111-1111-1111"),
        ("Card: 4111111111111111", "4111111111111111"),
        # MasterCard
        ("Card: 5555 5555 5555 4444", "5555 5555 5555 4444"),
        ("Card: 5555-5555-5555-4444", "5555-5555-5555-4444"),
        ("Card: 5555555555554444", "5555555555554444"),
        # Amex
        ("Card: 3782 822463 10005", "3782 822463 10005"),
        ("Card: 3782-822463-10005", "3782-822463-10005"),
        ("Card: 378282246310005", "378282246310005"),
        # Discover
        ("Card: 6011 1111 1111 1117", "6011 1111 1111 1117"),
        ("Card: 6011-1111-1111-1117", "6011-1111-1111-1117"),
        ("Card: 6011111111111117", "6011111111111117"),
        # Diners Club
        ("Card: 3056 930902 5904", "3056 930902 5904"),
        ("Card: 3056-930902-5904", "3056-930902-5904"),
        ("Card: 30569309025904", "30569309025904"),
        # JCB
        ("Card: 3530 1113 3330 0000", "3530 1113 3330 0000"),
        ("Card: 3530-1113-3330-0000", "3530-1113-3330-0000"),
        ("Card: 3530111333300000", "3530111333300000"),
    ],
)
def test_credit_card_detector_patterns(self, client, content, expected):
    """Each listed card number is reported verbatim by the credit-card regex."""
    response = client.post(
        "/api/v1/text/contents",
        json={
            "contents": [content],
            "detector_params": {"regex": ["credit-card"]},
        },
    )
    assert response.status_code == 200
    # Collect the matched text of every detection for the single content.
    matched_texts = [detection["text"] for detection in response.json()[0]]
    assert expected in matched_texts
90+
91+
@pytest.mark.parametrize(
    "content",
    [
        # Invalid Luhn
        "Card: 4111 1111 1111 1112",
        "Card: 5555-5555-5555-4440",
        "Card: 3782 822463 10006",
        "Card: 6011-1111-1111-1110",
        "Card: 3056-930902-5900",
        "Card: 3530-1113-3330-0001",
        # Not enough digits
        "Card: 4111 1111 1111",
        "Card: 5555-5555-5555",
        "Card: 3782 822463",
        "Card: 6011-1111-1111",
        "Card: 3056-930902",
        "Card: 3530-1113-3330",
    ],
)
def test_credit_card_detector_invalid(self, client, content):
    """Numbers that fail the Luhn check or are too short yield no detections."""
    request_body = {
        "contents": [content],
        "detector_params": {"regex": ["credit-card"]},
    }
    response = client.post("/api/v1/text/contents", json=request_body)
    assert response.status_code == 200
    assert response.json()[0] == []
118+
55119

56120
def test_multiple_regexes(self, client):
57121
payload = {
@@ -116,5 +180,18 @@ def test_multiple_contents(self, client):
116180
assert any("[email protected]" in d["text"] for d in results[0])
117181
assert any("123-45-6789" in d["text"] for d in results[1])
118182

183+
def test_single_detector(self, client):
184+
payload = {
185+
"contents": [
186+
187+
"SSN: 123-45-6789"
188+
],
189+
"detector_params": {"regex": "email"}
190+
}
191+
resp = client.post("/api/v1/text/contents", json=payload)
192+
assert resp.status_code == 200
193+
results = resp.json()
194+
assert any("[email protected]" in d["text"] for d in results[0])
195+
119196

120197

0 commit comments

Comments
 (0)