Skip to content

Commit 1b44c64

Browse files
committed
Migrate built-in detectors to python, add file-validation detectors
1 parent ff88f67 commit 1b44c64

File tree

12 files changed

+786
-2
lines changed

12 files changed

+786
-2
lines changed

detectors/Dockerfile.builtIn

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Base stage: UBI 9 minimal image with the python-pip and python-devel
# packages installed and pip/wheel upgraded.
FROM registry.access.redhat.com/ubi9/ubi-minimal as base
RUN microdnf update -y && \
    microdnf install -y --nodocs \
        python-pip python-devel && \
    pip install --upgrade --no-cache-dir pip wheel && \
    microdnf clean all

# FROM icr.io/fm-stack/ubi9-minimal-py39-torch as builder
# Builder stage: installs the Python dependencies on top of the base stage.
FROM base as builder

# Shared requirements first, then the built-in detector requirements.
# Both files are copied to the same ./requirements.txt path, so the second
# COPY replaces the first after it has been installed.
COPY ./common/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY ./built_in/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Final stage: application sources layered on top of the installed dependencies.
FROM builder

WORKDIR /app
# NOTE(review): presumably overridden with --build-arg CACHEBUST=<value> to
# force the COPY layers below to be rebuilt -- confirm against the build scripts.
ARG CACHEBUST=1
RUN echo "$CACHEBUST"
COPY ./common /app/detectors/common
COPY ./built_in/* /app

# The service listens on port 8080 (matches --port in CMD below).
EXPOSE 8080
CMD ["uvicorn", "app:app", "--workers", "4", "--host", "0.0.0.0", "--port", "8080", "--log-config", "/app/detectors/common/log_conf.yaml"]

detectors/built_in/app.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from fastapi import HTTPException
2+
3+
from base_detector_registry import BaseDetectorRegistry
4+
from regex_detectors import RegexDetectorRegistry
5+
from file_type_detectors import FileTypeDetectorRegistry
6+
7+
from prometheus_fastapi_instrumentator import Instrumentator
8+
from detectors.common.scheme import ContentAnalysisHttpRequest, ContentsAnalysisResponse
9+
from detectors.common.app import DetectorBaseAPI as FastAPI
10+
11+
# Application object. Note that `FastAPI` here is the project's
# DetectorBaseAPI class imported under that alias.
app = FastAPI()
# Instrument the app and expose the collected metrics through it
# (prometheus_fastapi_instrumentator).
Instrumentator().instrument(app).expose(app)


# Detector registries keyed by detector kind. The same key is looked up in
# the request's detector_params to decide whether a registry handles a request.
registry : dict[str, BaseDetectorRegistry] = {
    "regex": RegexDetectorRegistry(),
    "file_type": FileTypeDetectorRegistry(),
}
19+
20+
@app.post("/api/v1/text/contents", response_model=ContentsAnalysisResponse)
def detect_content(request: ContentAnalysisHttpRequest):
    # Runs every registry whose kind appears in request.detector_params over
    # each entry of request.contents and returns one list of detections per
    # content entry, in input order.
    #
    # Documented with comments rather than a docstring on purpose: FastAPI
    # publishes route docstrings in the generated OpenAPI description.
    params = request.detector_params
    per_content_results = []
    for content in request.contents:
        found = []
        for kind, handler in registry.items():
            if kind not in params:
                continue
            try:
                found.extend(handler.handle_request(content, params))
            except Exception as e:
                # Any detector failure is reported to the client as HTTP 500
                # carrying the exception text.
                raise HTTPException(status_code=500, detail=str(e))
        per_content_results.append(found)
    return ContentsAnalysisResponse(root=per_content_results)
33+
34+
35+
@app.get("/registry")
def get_registry():
    # Lists the available detectors as
    # {detector kind: {detector name: docstring of the detector function}}.
    # Comment instead of docstring: FastAPI would publish a route docstring
    # in the generated OpenAPI description.
    return {
        detector_type: {
            detector_name: detector_fn.__doc__
            for detector_name, detector_fn in detector_registry.get_registry().items()
        }
        for detector_type, detector_registry in registry.items()
    }
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from abc import ABC, abstractmethod
2+
from typing import List
3+
4+
from detectors.common.scheme import ContentAnalysisResponse
5+
6+
class BaseDetectorRegistry(ABC):
    """Common interface for a group of detectors selectable by name."""

    def __init__(self):
        # Placeholder; the concrete registries in this package assign a
        # mapping of detector name -> detector callable here.
        self.registry = None

    @abstractmethod
    def handle_request(self, content: str, detector_params: dict) -> List[ContentAnalysisResponse]:
        """Run the detectors selected in ``detector_params`` against ``content``.

        Implementations return the list of detections found in ``content``.
        """
        ...

    def get_registry(self):
        """Return the registry object stored on this instance."""
        return self.registry
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
import json
2+
import jsonschema
3+
import xml.etree.ElementTree as ET
4+
import xmlschema
5+
import yaml
6+
7+
from typing import List, Optional
8+
9+
from base_detector_registry import BaseDetectorRegistry
10+
from detectors.common.scheme import ContentAnalysisResponse
11+
12+
13+
def is_valid_json(s: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents is not valid JSON"""
    # The docstring above is kept verbatim: the /registry endpoint returns
    # detector docstrings to clients.
    #
    # Returns None when `s` parses as JSON, otherwise a single "invalid_json"
    # detection spanning the whole input.
    try:
        json.loads(s)
    except (ValueError, TypeError):
        pass
    else:
        return None
    return ContentAnalysisResponse(
        start=0,
        end=len(s),
        text=s,
        detection="invalid_json",
        detection_type="file_type",
        score=1.0,
    )
27+
28+
29+
def is_valid_json_schema(s: str, schema: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents does not satisfy a provided JSON schema. To specify a schema, replace $SCHEMA with a JSON schema."""
    # The docstring above is kept verbatim: the /registry endpoint returns
    # detector docstrings to clients.
    #
    # Args:
    #     s: text content to check.
    #     schema: JSON Schema, serialized as a JSON string.
    # Returns:
    #     None when `s` is valid JSON that satisfies `schema`; otherwise one
    #     detection labelled "invalid_json", "invalid_schema" or
    #     "json_schema_mismatch".

    def _detection(label: str) -> ContentAnalysisResponse:
        # Every detection reports the whole content, so the offsets always
        # refer to `s` (never to the schema text).
        return ContentAnalysisResponse(
            start=0,
            end=len(s),
            text=s,
            detection=label,
            detection_type="file_type",
            score=1.0,
        )

    # Parse the content once and keep the result for schema validation.
    try:
        msg_data = json.loads(s)
    except (ValueError, TypeError):
        return _detection("invalid_json")

    # The schema must itself be parseable JSON ...
    try:
        schema_data = json.loads(schema)
    except (ValueError, TypeError):
        return _detection("invalid_schema")
    # ... and a JSON Schema is an object or a boolean; other JSON values
    # (numbers, null, ...) can make jsonschema fail with unrelated errors.
    if not isinstance(schema_data, (dict, bool)):
        return _detection("invalid_schema")

    try:
        jsonschema.validate(instance=msg_data, schema=schema_data)
    except jsonschema.SchemaError:
        # Well-formed JSON that is not a valid JSON Schema: report it as a
        # detection instead of letting it surface as a server error.
        return _detection("invalid_schema")
    except jsonschema.ValidationError:
        return _detection("json_schema_mismatch")
    return None
64+
65+
66+
def is_valid_yaml(s: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents is not valid YAML"""
    # The docstring above is kept verbatim: the /registry endpoint returns
    # detector docstrings to clients.
    #
    # Returns None when yaml.safe_load accepts `s`, otherwise a single
    # "invalid_yaml" detection spanning the whole input. Any exception from
    # the loader counts as invalid.
    try:
        yaml.safe_load(s)
    except Exception:
        pass
    else:
        return None
    return ContentAnalysisResponse(
        start=0,
        end=len(s),
        text=s,
        detection="invalid_yaml",
        detection_type="file_type",
        score=1.0,
    )
80+
81+
def is_valid_yaml_schema(s: str, schema: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents does not satisfy a provided schema. To specify a schema, replace $SCHEMA with a JSON schema. That's not a typo, you validate YAML with a JSON schema!"""
    # The docstring above is kept verbatim: the /registry endpoint returns
    # detector docstrings to clients.
    #
    # Args:
    #     s: text content to check (YAML).
    #     schema: JSON Schema, serialized as a JSON string.
    # Returns:
    #     None when `s` is valid YAML that satisfies `schema`; otherwise one
    #     detection labelled "invalid_yaml", "invalid_schema" or
    #     "yaml_schema_mismatch".

    def _detection(label: str) -> ContentAnalysisResponse:
        # Every detection reports the whole content, so the offsets always
        # refer to `s` (never to the schema text).
        return ContentAnalysisResponse(
            start=0,
            end=len(s),
            text=s,
            detection=label,
            detection_type="file_type",
            score=1.0,
        )

    # Load the content once and keep the result for schema validation.
    # Broad except is deliberate: any loader failure means "not valid YAML".
    try:
        msg_data = yaml.safe_load(s)
    except Exception:
        return _detection("invalid_yaml")

    # The schema must itself be parseable JSON ...
    try:
        schema_data = json.loads(schema)
    except (ValueError, TypeError):
        return _detection("invalid_schema")
    # ... and a JSON Schema is an object or a boolean; other JSON values
    # (numbers, null, ...) can make jsonschema fail with unrelated errors.
    if not isinstance(schema_data, (dict, bool)):
        return _detection("invalid_schema")

    try:
        jsonschema.validate(instance=msg_data, schema=schema_data)
    except jsonschema.SchemaError:
        # Well-formed JSON that is not a valid JSON Schema: report it as a
        # detection instead of letting it surface as a server error.
        return _detection("invalid_schema")
    except jsonschema.ValidationError:
        return _detection("yaml_schema_mismatch")
    return None
115+
116+
117+
def is_valid_xml(s: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents is not valid XML"""
    # The docstring above is kept verbatim: the /registry endpoint returns
    # detector docstrings to clients.
    #
    # Returns None when ElementTree can parse `s`, otherwise a single
    # "invalid_xml" detection spanning the whole input. Any exception from
    # the parser counts as invalid.
    try:
        ET.fromstring(s)
    except Exception:
        pass
    else:
        return None
    return ContentAnalysisResponse(
        start=0,
        end=len(s),
        text=s,
        detection="invalid_xml",
        detection_type="file_type",
        score=1.0,
    )
131+
132+
133+
def is_valid_xml_schema(s: str, schema: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents does not satisfy a provided XML schema. To specify a schema, replace $SCHEMA with an XML Schema Definition (XSD)"""
    # The docstring above is kept verbatim: the /registry endpoint returns
    # detector docstrings to clients.
    #
    # Args:
    #     s: text content to check (XML).
    #     schema: the XSD, as a string.
    # Returns:
    #     None when `s` is well-formed XML that satisfies `schema`; otherwise
    #     one detection labelled "invalid_xml", "invalid_xml_schema" or
    #     "xml_schema_mismatch".

    def _detection(label: str) -> ContentAnalysisResponse:
        # Every detection reports the whole content, so the offsets always
        # refer to `s` (never to the schema text).
        return ContentAnalysisResponse(
            start=0,
            end=len(s),
            text=s,
            detection=label,
            detection_type="file_type",
            score=1.0,
        )

    # Well-formedness first; reuse the plain XML detector's result as-is.
    not_well_formed = is_valid_xml(s)
    if not_well_formed is not None:
        return not_well_formed

    # NOTE(review): `schema` comes from the request. xmlschema.XMLSchema also
    # accepts file paths and URLs as its source -- confirm that untrusted
    # values cannot be used to read local or remote resources.
    try:
        xs = xmlschema.XMLSchema(schema)
    except Exception:
        # Anything that prevents building the schema means the XSD is unusable.
        return _detection("invalid_xml_schema")

    try:
        xs.validate(s)
    except xmlschema.XMLSchemaValidationError:
        return _detection("xml_schema_mismatch")
    return None
163+
164+
165+
166+
class FileTypeDetectorRegistry(BaseDetectorRegistry):
    """Registry of detectors that validate text as JSON, XML or YAML.

    Plain entries ("json", "xml", "yaml") check well-formedness only. Entries
    of the form "<kind>-with-schema:<schema>" additionally validate the
    content against the schema text that follows the first colon.
    """

    def __init__(self):
        super().__init__()
        # Key order is preserved in the /registry listing.
        self.registry = {
            "json": is_valid_json,
            "xml": is_valid_xml,
            "yaml": is_valid_yaml,
            "json-with-schema:$SCHEMA": is_valid_json_schema,
            "xml-with-schema:$SCHEMA": is_valid_xml_schema,
            "yaml-with-schema:$SCHEMA": is_valid_yaml_schema,
        }

    def handle_request(self, content: str, detector_params: dict) -> List[ContentAnalysisResponse]:
        """Run the requested file-type detectors against ``content``.

        Args:
            content: text to validate.
            detector_params: request parameters; only a list stored under
                the "file_type" key is acted upon.

        Returns:
            The detections produced by the requested detectors (detectors
            that find nothing contribute no entry).

        Raises:
            ValueError: if an entry names no known detector, including a
                "-with-schema" entry that lacks the ":<schema>" part.
        """
        detections = []
        if not ("file_type" in detector_params and isinstance(detector_params["file_type"], list)):
            return detections

        schema_detectors = {
            "json-with-schema": is_valid_json_schema,
            "yaml-with-schema": is_valid_yaml_schema,
            "xml-with-schema": is_valid_xml_schema,
        }
        for file_type in detector_params["file_type"]:
            # Split on the FIRST colon only: everything after it is schema
            # text and may legitimately contain colons or the marker itself.
            kind, colon, schema = file_type.partition(":")
            if colon and kind in schema_detectors:
                result = schema_detectors[kind](content, schema)
            elif file_type in self.registry:
                result = self.registry[file_type](content)
            else:
                raise ValueError(f"Unrecognized file type: {file_type}")
            if result is not None:
                detections.append(result)
        return detections

detectors/built_in/regex_detectors.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import re
2+
from http.client import HTTPException
3+
from typing import List
4+
from base_detector_registry import BaseDetectorRegistry
5+
from detectors.common.scheme import ContentAnalysisResponse
6+
7+
8+
def email_address_detector(string: str) -> List[ContentAnalysisResponse]:
    """Detect email addresses in the text contents"""
    # Docstring kept verbatim: the /registry endpoint returns it to clients.
    # Matches are reported with detection_type "pii" / detection "email_address".
    return get_regex_detections(
        string,
        r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
        "pii",
        "email_address",
    )
12+
13+
def credit_card_detector(string: str) -> List[ContentAnalysisResponse]:
    """Detect credit cards in the text contents"""
    # Docstring kept verbatim: the /registry endpoint returns it to clients.
    # The pattern matches four groups of four digits separated by a dash or
    # a space; the first group must start with one of the listed prefixes.
    return get_regex_detections(
        string,
        r"\b(?:4\d{3}|5[0-5]\d{2}|6\d{3}|1\d{3}|3\d{3})[- ]\d{4}[- ]\d{4}[- ]\d{4}\b",
        "pii",
        "credit_card",
    )
17+
18+
19+
def ipv4_detector(string: str) -> List[ContentAnalysisResponse]:
    """Detect IPv4 addresses in the text contents"""
    # Docstring kept verbatim: the /registry endpoint returns it to clients.
    #
    # Raw strings are used so that "\." reaches the regex engine unchanged;
    # in a normal string literal it is an invalid escape sequence that newer
    # Python versions warn about. The compiled pattern is identical to the
    # previous one: four 0-255 octets joined by literal dots.
    octet = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
    pattern = re.compile(r"\.".join([octet] * 4), re.IGNORECASE)
    return get_regex_detections(string, pattern, "pii", "ipv4")
26+
27+
def ipv6_detector(string: str) -> List[ContentAnalysisResponse]:
    """Detect IPv6 addresses in the text contents"""
    # Docstring kept verbatim: the /registry endpoint returns it to clients.
    #
    # Changes relative to the previous pattern:
    #   * raw string literal, so "\s", "\d" and "\." are no longer invalid
    #     escape sequences in the Python source (pattern text is unchanged);
    #   * the embedded dotted-quad tail used "25[0-4]" for the top octet
    #     range, which made 255 unmatchable; it is now "25[0-5]".
    # Note the pattern starts and ends with \s*, so surrounding whitespace is
    # part of each reported match.
    pattern = re.compile(
        r"\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*",
        re.VERBOSE | re.IGNORECASE | re.DOTALL,
    )
    return get_regex_detections(string, pattern, "pii", "ipv6")
34+
35+
# === USA Specific =================================================================================
36+
def ssn_detector(string: str) -> List[ContentAnalysisResponse]:
    """Detect social security numbers in the text contents"""
    # Docstring kept verbatim: the /registry endpoint returns it to clients.
    # Pattern: 3-2-4 digit groups separated by a dash or a space.
    return get_regex_detections(
        string,
        r"\b\d{3}[- ]\d{2}[- ]\d{4}\b",
        "pii",
        "social_security_number",
    )
40+
41+
def us_phone_number_detector(string: str) -> List[ContentAnalysisResponse]:
    """Detect US phone numbers in the text contents"""
    # Docstring kept verbatim: the /registry endpoint returns it to clients.
    # Pattern: optional "+1"/"1" prefix, a 3-digit area code (optionally in
    # parentheses), then 3 and 4 digits with "-", "." or whitespace separators.
    return get_regex_detections(
        string,
        r"(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]+\d{3}[-.\s]?\d{4}\b",
        "pii",
        "us-phone-number",
    )
45+
46+
# === UK Specific =================================================================================
47+
def uk_post_code_detector(string: str) -> List[ContentAnalysisResponse]:
    """Detect UK post codes in the text contents"""
    # Docstring kept verbatim: the /registry endpoint returns it to clients.
    # The pattern is case-sensitive (upper-case letters only) and allows an
    # optional single space between the two halves of the code.
    return get_regex_detections(
        string,
        r"\b([A-Z]{1,2}[0-9][0-9A-Z]? ?[0-9][A-Z]{2})\b",
        "pii",
        "uk-post-code",
    )
51+
52+
53+
def get_regex_detections(string, pattern, detection_type, detection) -> List[ContentAnalysisResponse]:
    """Return one detection per non-overlapping match of ``pattern`` in ``string``.

    ``pattern`` may be a pattern string or a compiled pattern (both are
    passed straight to ``re.finditer``). Each detection carries the match
    offsets, the matched text, the given ``detection_type`` / ``detection``
    labels and a fixed score of 1.0.
    """
    return [
        ContentAnalysisResponse(
            start=hit.start(),
            end=hit.end(),
            text=hit.group(),
            detection_type=detection_type,
            detection=detection,
            score=1.0,
        )
        for hit in re.finditer(pattern, string)
    ]
66+
67+
# Dummy function that adds documentation for the custom regex detector to the
# registry: it is registered under "$CUSTOM_REGEX" only so that its docstring
# shows up in the registry listing; it is never used to run a detection.
def custom_regex_documenter():
    """Replace $CUSTOM_REGEX with a custom regex to define your own regex detector"""
70+
71+
72+
# === ROUTER =======================================================================================
73+
class RegexDetectorRegistry(BaseDetectorRegistry):
    """Registry of the built-in regex detectors plus ad-hoc custom patterns."""

    def __init__(self):
        # Key order is preserved in the /registry listing.
        self.registry = {
            "credit-card": credit_card_detector,
            "email": email_address_detector,
            "ipv4": ipv4_detector,
            "ipv6": ipv6_detector,
            "us-phone-number": us_phone_number_detector,
            "us-social-security-number": ssn_detector,
            "uk-post-code": uk_post_code_detector,
            "$CUSTOM_REGEX": custom_regex_documenter,
        }

    def handle_request(self, content: str, detector_params: dict) -> List[ContentAnalysisResponse]:
        """Run the requested regex detectors against ``content``.

        Only a list stored under the "regex" key of ``detector_params`` is
        acted upon. Each entry is either the name of a built-in detector or,
        failing that, is itself used as a regular expression.
        """
        has_regex_list = "regex" in detector_params and isinstance(detector_params["regex"], list)
        if not has_regex_list:
            return []

        found = []
        for entry in detector_params["regex"]:
            if entry == "$CUSTOM_REGEX":
                # The literal placeholder is documentation only; skip it.
                continue
            if entry in self.registry:
                found.extend(self.registry[entry](content))
            else:
                # Not a known name: treat the entry itself as the pattern.
                found.extend(get_regex_detections(content, entry, "regex", "custom-regex"))
        return found

detectors/built_in/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
markdown==3.8.2
2+
jsonschema==4.24.0
3+
xmlschema==4.1.0

detectors/common/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def main(app):
107107
config = {
108108
"server": {
109109
"host": "0.0.0.0",
110-
"port": "8000",
110+
"port": "8080",
111111
"workers": 1,
112112
"limit_concurrency": 1000,
113113
"timeout_keep_alive": 30,

0 commit comments

Comments
 (0)