Skip to content

Commit db210ad

Browse files
Merge pull request #27 from trustyai-explainability/builtInDetectors
FEAT: Migrate built-in detectors to python, add file-validation detectors
2 parents 013afef + 83c766d commit db210ad

File tree

13 files changed

+954
-4
lines changed

13 files changed

+954
-4
lines changed

.github/workflows/build-and-push-hf.yaml

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,34 +55,44 @@ jobs:
5555
echo "SHA: ${{ github.event.pull_request.head.sha }}"
5656
echo "MAIN IMAGE AT: ${{ vars.QUAY_RELEASE_REPO }}:latest"
5757
echo "CI IMAGE AT: quay.io/trustyai/guardrails-detector-huggingface-runtime-ci:${{ github.event.pull_request.head.sha }}"
58+
echo "Built-In Detector CI IMAGE AT: quay.io/trustyai/regex-detector-ci:${{ github.event.pull_request.head.sha }}"
5859
5960
# Set environments depending on context
6061
- name: Set CI environment
6162
if: env.BUILD_CONTEXT == 'ci'
6263
run: |
6364
echo "TAG=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
6465
echo "IMAGE_NAME=quay.io/trustyai/guardrails-detector-huggingface-runtime-ci" >> $GITHUB_ENV
66+
echo "BUILTIN_IMAGE_NAME=quay.io/trustyai/regex-detector-ci" >> $GITHUB_ENV
6567
- name: Set main-branch environment
6668
if: env.BUILD_CONTEXT == 'main'
6769
run: |
6870
echo "TAG=latest" >> $GITHUB_ENV
6971
echo "IMAGE_NAME=${{ vars.QUAY_RELEASE_REPO }}" >> $GITHUB_ENV
72+
echo "BUILTIN_IMAGE_NAME=quay.io/trustyai/regex-detector" >> $GITHUB_ENV
7073
- name: Set tag environment
7174
if: env.BUILD_CONTEXT == 'tag'
7275
run: |
7376
echo "TAG=${{ github.ref_name }}" >> $GITHUB_ENV
7477
echo "IMAGE_NAME=${{ vars.QUAY_RELEASE_REPO }}" >> $GITHUB_ENV
78+
echo "BUILTIN_IMAGE_NAME=quay.io/trustyai/regex-detector" >> $GITHUB_ENV
7579
#
7680
# Run docker commands
7781
- name: Put expiry date on CI-tagged image
7882
if: env.BUILD_CONTEXT == 'ci'
79-
run: echo 'LABEL quay.expires-after=7d#' >> detectors/Dockerfile.hf
83+
run: |
84+
echo 'LABEL quay.expires-after=7d#' >> detectors/Dockerfile.hf
85+
echo 'LABEL quay.expires-after=7d#' >> detectors/Dockerfile.builtIn
8086
- name: Build image
8187
run: docker build -t ${{ env.IMAGE_NAME }}:$TAG -f detectors/Dockerfile.hf detectors
8288
- name: Log in to Quay
8389
run: docker login -u ${{ secrets.QUAY_ROBOT_USERNAME }} -p ${{ secrets.QUAY_ROBOT_SECRET }} quay.io
8490
- name: Push to Quay CI repo
8591
run: docker push ${{ env.IMAGE_NAME }}:$TAG
92+
- name: Build built-in detector image
93+
run: docker build -t ${{ env.BUILTIN_IMAGE_NAME }}:$TAG -f detectors/Dockerfile.builtIn detectors
94+
- name: Push to Quay CI repo
95+
run: docker push ${{ env.BUILTIN_IMAGE_NAME }}:$TAG
8696

8797
# Leave comment
8898
- uses: peter-evans/find-comment@v3
@@ -104,6 +114,7 @@ jobs:
104114
PR image build completed successfully!
105115
106116
📦 [PR image](https://quay.io/repository/trustyai/guardrails-detector-huggingface-runtime-ci?tab=tags): `quay.io/trustyai/guardrails-detector-huggingface-runtime-ci:${{ github.event.pull_request.head.sha }}`
117+
📦 [Built-in detector PR image](https://quay.io/repository/trustyai/regex-detector-ci?tab=tags): `quay.io/trustyai/regex-detector-ci:${{ github.event.pull_request.head.sha }}`
107118
- name: Trivy scan
108119
uses: aquasecurity/[email protected]
109120
with:
@@ -115,8 +126,22 @@ jobs:
115126
exit-code: '0'
116127
ignore-unfixed: false
117128
vuln-type: 'os,library'
118-
129+
- name: Trivy scan, built-in image
130+
uses: aquasecurity/[email protected]
131+
with:
132+
scan-type: 'image'
133+
image-ref: "${{ env.BUILTIN_IMAGE_NAME }}:${{ env.TAG }}"
134+
format: 'sarif'
135+
output: 'trivy-results-built-in.sarif'
136+
severity: 'MEDIUM,HIGH,CRITICAL'
137+
exit-code: '0'
138+
ignore-unfixed: false
139+
vuln-type: 'os,library'
119140
- name: Update Security tab
120141
uses: github/codeql-action/upload-sarif@v3
121142
with:
122143
sarif_file: 'trivy-results.sarif'
144+
- name: Update Security tab
145+
uses: github/codeql-action/upload-sarif@v3
146+
with:
147+
sarif_file: 'trivy-results-built-in.sarif'

detectors/Dockerfile.builtIn

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Base stage: UBI 9 minimal with pip and python-devel installed and
# pip/wheel upgraded.
FROM registry.access.redhat.com/ubi9/ubi-minimal AS base
RUN microdnf update -y && \
    microdnf install -y --nodocs \
        python-pip python-devel && \
    pip install --upgrade --no-cache-dir pip wheel && \
    microdnf clean all

# FROM icr.io/fm-stack/ubi9-minimal-py39-torch as builder
# Builder stage: installs the Python dependencies. The second COPY overwrites
# the first requirements.txt, so each file is installed right after its copy.
FROM base AS builder

COPY ./common/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY ./built_in/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Final stage: application sources on top of the installed dependencies.
FROM builder

WORKDIR /app
# Passing a different CACHEBUST build-arg invalidates the cache from the
# following RUN onwards, forcing the source COPY steps to be re-executed.
ARG CACHEBUST=1
RUN echo "$CACHEBUST"
COPY ./common /app/detectors/common
# A wildcard source can expand to several files, so the destination is
# written as a directory (trailing slash).
COPY ./built_in/* /app/

EXPOSE 8080
CMD ["uvicorn", "app:app", "--workers", "4", "--host", "0.0.0.0", "--port", "8080", "--log-config", "/app/detectors/common/log_conf.yaml"]

detectors/built_in/app.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from fastapi import HTTPException
2+
3+
from base_detector_registry import BaseDetectorRegistry
4+
from regex_detectors import RegexDetectorRegistry
5+
from file_type_detectors import FileTypeDetectorRegistry
6+
7+
from prometheus_fastapi_instrumentator import Instrumentator
8+
from detectors.common.scheme import ContentAnalysisHttpRequest, ContentsAnalysisResponse
9+
from detectors.common.app import DetectorBaseAPI as FastAPI
10+
11+
# Application object; ``FastAPI`` here is the project's ``DetectorBaseAPI``.
app = FastAPI()
# Instrument the app with prometheus_fastapi_instrumentator and expose it.
Instrumentator().instrument(app).expose(app)


# Detector kind (the key looked up in ``detector_params``) -> the registry
# that handles requests for that kind.
registry: dict[str, BaseDetectorRegistry] = {
    "regex": RegexDetectorRegistry(),
    "file_type": FileTypeDetectorRegistry(),
}
19+
20+
@app.post("/api/v1/text/contents", response_model=ContentsAnalysisResponse)
def detect_content(request: ContentAnalysisHttpRequest):
    """Run every requested built-in detector kind over each content string.

    For each entry of ``request.contents`` the detections of all registries
    whose kind appears in ``request.detector_params`` are concatenated; the
    response holds one such list per content entry, in input order.

    Raises:
        HTTPException: re-raised unchanged when a registry raises one; any
            other exception from a registry is wrapped in a 500.
    """
    all_detections = []
    for text in request.contents:
        found = []
        for kind, handler in registry.items():
            # Kinds that the request does not mention are skipped entirely.
            if kind not in request.detector_params:
                continue
            try:
                found.extend(handler.handle_request(text, request.detector_params))
            except HTTPException:
                # Deliberate client-facing errors pass through untouched.
                raise
            except Exception as err:
                raise HTTPException(status_code=500) from err
        all_detections.append(found)
    return ContentsAnalysisResponse(root=all_detections)
35+
36+
37+
@app.get("/registry")
def get_registry():
    """Describe the available detectors.

    Returns a mapping of detector kind -> {detector name -> docstring of the
    detector function}, built from each registry's ``get_registry()``.
    """
    return {
        kind: {
            name: fn.__doc__
            for name, fn in kind_registry.get_registry().items()
        }
        for kind, kind_registry in registry.items()
    }
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from abc import ABC, abstractmethod
2+
from typing import List
3+
4+
from detectors.common.scheme import ContentAnalysisResponse
5+
6+
class BaseDetectorRegistry(ABC):
    """Abstract base for a family of built-in detectors.

    A concrete registry stores its detectors in ``self.registry`` and
    implements ``handle_request`` to run them against a piece of content.
    """

    def __init__(self):
        # Placeholder only; concrete registries assign their own mapping.
        self.registry = None

    @abstractmethod
    def handle_request(self, content: str, detector_params: dict) -> List[ContentAnalysisResponse]:
        """Run the detectors selected by ``detector_params`` over ``content``.

        Returns the list of detections produced for this content.
        """

    def get_registry(self):
        """Return the object stored in ``self.registry``."""
        return self.registry
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
import json
2+
from fastapi import HTTPException
3+
4+
import jsonschema
5+
import xml.etree.ElementTree as ET
6+
import xmlschema
7+
import yaml
8+
9+
from typing import List, Optional
10+
11+
from base_detector_registry import BaseDetectorRegistry
12+
from detectors.common.scheme import ContentAnalysisResponse
13+
14+
15+
def is_valid_json(s: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents is not valid JSON"""
    # NOTE: the docstring above is returned verbatim by the /registry endpoint,
    # so it is user-facing text; maintainer notes belong in comments.
    #
    # Returns None when ``s`` parses as JSON, otherwise a single detection
    # that spans the whole input.
    try:
        json.loads(s)
    except (ValueError, TypeError):
        return ContentAnalysisResponse(
            start=0,
            end=len(s),
            text=s,
            detection="invalid_json",
            detection_type="file_type",
            score=1.0,
        )
    return None
29+
30+
31+
def is_valid_json_schema(s: str, schema: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents does not satisfy a provided JSON schema. To specify a schema, replace $SCHEMA with a JSON schema."""
    # NOTE: the docstring above is returned verbatim by the /registry endpoint,
    # so it is user-facing text; maintainer notes belong in comments.
    #
    # Args:
    #   s: the message text to validate.
    #   schema: the JSON Schema, as a JSON-encoded string.
    # Returns:
    #   None when ``s`` is valid JSON that satisfies ``schema``; otherwise one
    #   detection covering the whole of ``s``:
    #     - "invalid_json"          ``s`` is not JSON (from is_valid_json)
    #     - "invalid_schema"        ``schema`` is not JSON or not a valid schema
    #     - "json_schema_mismatch"  ``s`` violates the schema

    def _detection(label: str) -> ContentAnalysisResponse:
        # start/end always describe the span of ``text`` (the message), never
        # the length of the schema string.
        return ContentAnalysisResponse(
            start=0,
            end=len(s),
            text=s,
            detection=label,
            detection_type="file_type",
            score=1.0,
        )

    not_json = is_valid_json(s)
    if not_json is not None:
        return not_json
    msg_data = json.loads(s)

    # The schema must itself be parseable JSON.
    try:
        schema_data = json.loads(schema)
    except (ValueError, TypeError):
        return _detection("invalid_schema")

    # jsonschema.validate first checks the schema (SchemaError) and then the
    # instance (ValidationError); a malformed schema is the caller's mistake
    # and must not surface as an unhandled server error.
    try:
        jsonschema.validate(instance=msg_data, schema=schema_data)
    except jsonschema.SchemaError:
        return _detection("invalid_schema")
    except jsonschema.ValidationError:
        return _detection("json_schema_mismatch")
    return None
66+
67+
68+
def is_valid_yaml(s: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents is not valid YAML"""
    # NOTE: the docstring above is returned verbatim by the /registry endpoint,
    # so it is user-facing text; maintainer notes belong in comments.
    #
    # Returns None when yaml.safe_load accepts ``s``; any exception raised by
    # the loader yields a single detection spanning the whole input.
    try:
        yaml.safe_load(s)
    except Exception:
        return ContentAnalysisResponse(
            start=0,
            end=len(s),
            text=s,
            detection="invalid_yaml",
            detection_type="file_type",
            score=1.0,
        )
    return None
82+
83+
def is_valid_yaml_schema(s: str, schema) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents does not satisfy a provided schema. To specify a schema, replace $SCHEMA with a JSON schema. That's not a typo, you validate YAML with a JSON schema!"""
    # NOTE: the docstring above is returned verbatim by the /registry endpoint,
    # so it is user-facing text; maintainer notes belong in comments.
    #
    # Args:
    #   s: the message text to validate (YAML).
    #   schema: the JSON Schema, as a JSON-encoded string.
    # Returns:
    #   None when ``s`` is valid YAML that satisfies ``schema``; otherwise one
    #   detection covering the whole of ``s``:
    #     - "invalid_yaml"          ``s`` is not YAML (from is_valid_yaml)
    #     - "invalid_schema"        ``schema`` is not JSON or not a valid schema
    #     - "yaml_schema_mismatch"  ``s`` violates the schema

    def _detection(label: str) -> ContentAnalysisResponse:
        # start/end always describe the span of ``text`` (the message), never
        # the length of the schema string.
        return ContentAnalysisResponse(
            start=0,
            end=len(s),
            text=s,
            detection=label,
            detection_type="file_type",
            score=1.0,
        )

    not_yaml = is_valid_yaml(s)
    if not_yaml is not None:
        return not_yaml
    msg_data = yaml.safe_load(s)

    # The schema must itself be parseable JSON.
    try:
        schema_data = json.loads(schema)
    except (ValueError, TypeError):
        return _detection("invalid_schema")

    # jsonschema.validate first checks the schema (SchemaError) and then the
    # instance (ValidationError); a malformed schema is the caller's mistake
    # and must not surface as an unhandled server error.
    try:
        jsonschema.validate(instance=msg_data, schema=schema_data)
    except jsonschema.SchemaError:
        return _detection("invalid_schema")
    except jsonschema.ValidationError:
        return _detection("yaml_schema_mismatch")
    return None
117+
118+
119+
def is_valid_xml(s: str) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents is not valid XML"""
    # NOTE: the docstring above is returned verbatim by the /registry endpoint,
    # so it is user-facing text; maintainer notes belong in comments.
    #
    # Returns None when ElementTree can parse ``s``; any exception raised by
    # the parser yields a single detection spanning the whole input.
    # NOTE(review): ``s`` comes from the request body and is parsed with the
    # stdlib ElementTree parser - confirm that is acceptable for untrusted XML.
    try:
        ET.fromstring(s)
    except Exception:
        return ContentAnalysisResponse(
            start=0,
            end=len(s),
            text=s,
            detection="invalid_xml",
            detection_type="file_type",
            score=1.0,
        )
    return None
133+
134+
135+
def is_valid_xml_schema(s: str, schema) -> Optional[ContentAnalysisResponse]:
    """Detect if the text contents does not satisfy a provided XML schema. To specify a schema, replace $SCHEMA with an XML Schema Definition (XSD)"""
    # NOTE: the docstring above is returned verbatim by the /registry endpoint,
    # so it is user-facing text; maintainer notes belong in comments.
    #
    # Args:
    #   s: the message text to validate (XML).
    #   schema: the XSD, expected to be a string containing the schema text.
    # Returns:
    #   None when ``s`` is well-formed XML that satisfies ``schema``; otherwise
    #   one detection covering the whole of ``s``:
    #     - "invalid_xml"          ``s`` is not XML (from is_valid_xml)
    #     - "invalid_xml_schema"   the XSD could not be loaded
    #     - "xml_schema_mismatch"  ``s`` violates the XSD

    def _detection(label: str) -> ContentAnalysisResponse:
        # start/end always describe the span of ``text`` (the message), never
        # the length of the schema string.
        return ContentAnalysisResponse(
            start=0,
            end=len(s),
            text=s,
            detection=label,
            detection_type="file_type",
            score=1.0,
        )

    not_xml = is_valid_xml(s)
    if not_xml is not None:
        return not_xml

    # NOTE(review): xmlschema.XMLSchema accepts a file path or URL as well as
    # literal XSD text, and ``schema`` is request-supplied - confirm whether
    # resource access should be restricted here.
    try:
        xs = xmlschema.XMLSchema(schema)
    except Exception:
        return _detection("invalid_xml_schema")

    try:
        xs.validate(s)
    except xmlschema.XMLSchemaValidationError:
        return _detection("xml_schema_mismatch")
    return None
165+
166+
167+
168+
class FileTypeDetectorRegistry(BaseDetectorRegistry):
    """Registry of detectors that flag text which is not valid JSON/XML/YAML,
    optionally validated against a schema supplied inline in the detector name.
    """

    # Prefix (including the ':' separator) -> validator taking (content, schema).
    # Everything after the prefix is passed on as the schema text.
    _SCHEMA_DETECTORS = (
        ("json-with-schema:", is_valid_json_schema),
        ("yaml-with-schema:", is_valid_yaml_schema),
        ("xml-with-schema:", is_valid_xml_schema),
    )

    def __init__(self):
        super().__init__()
        # Detector name -> function. The "$SCHEMA" keys are templates shown to
        # users via get_registry(); real requests carry the schema in place of
        # "$SCHEMA" and are matched by prefix in handle_request.
        self.registry = {
            "json": is_valid_json,
            "xml": is_valid_xml,
            "yaml": is_valid_yaml,
            "json-with-schema:$SCHEMA": is_valid_json_schema,
            "xml-with-schema:$SCHEMA": is_valid_xml_schema,
            "yaml-with-schema:$SCHEMA": is_valid_yaml_schema,
        }

    def _detect(self, file_type: str, content: str) -> Optional[ContentAnalysisResponse]:
        """Run the single detector named by ``file_type`` against ``content``."""
        for prefix, validator in self._SCHEMA_DETECTORS:
            if file_type.startswith(prefix):
                # Slice instead of split: the schema may itself contain the
                # prefix text, and a missing ':' must not raise IndexError.
                return validator(content, file_type[len(prefix):])
        if file_type in self.registry:
            return self.registry[file_type](content)
        raise HTTPException(status_code=400, detail=f"Unrecognized file type: {file_type}")

    def handle_request(self, content: str, detector_params: dict) -> List[ContentAnalysisResponse]:
        """Run the requested file-type detectors over ``content``.

        Args:
            content: the text to check.
            detector_params: request parameters; ``detector_params["file_type"]``
                is a detector name or a list of names. Any other value type is
                ignored and yields no detections.

        Returns:
            The detections produced, in the order the detectors were requested.

        Raises:
            HTTPException: 400 when a requested name is not a string or does not
                match any known detector.
        """
        if "file_type" not in detector_params:
            return []
        file_types = detector_params["file_type"]
        if isinstance(file_types, str):
            file_types = [file_types]
        elif not isinstance(file_types, list):
            return []

        detections = []
        for file_type in file_types:
            if not isinstance(file_type, str):
                # A malformed entry is a client error, not a server fault.
                raise HTTPException(status_code=400, detail=f"Unrecognized file type: {file_type}")
            result = self._detect(file_type, content)
            if result is not None:
                detections.append(result)
        return detections

0 commit comments

Comments
 (0)