Commit 99cae02

🎉 initial commit of the pii detector based on a hf sequence classification model
1 parent 0bdaca8 commit 99cae02

File tree: 7 files changed, +270 −0 lines changed

detectors/Dockerfile.pii-transformer

Lines changed: 32 additions & 0 deletions
FROM registry.access.redhat.com/ubi9/ubi-minimal as base
RUN microdnf update -y && \
    microdnf install -y --nodocs \
        python-pip python-devel && \
    pip install --upgrade --no-cache-dir pip wheel && \
    microdnf clean all
RUN pip install --no-cache-dir torch

# FROM icr.io/fm-stack/ubi9-minimal-py39-torch as builder
FROM base as builder

COPY ./common/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY ./pii_transformer/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

FROM builder

WORKDIR /app

COPY ./common /common
COPY ./pii_transformer/app.py /app
COPY ./pii_transformer/detector.py /app
COPY ./pii_transformer/scheme.py /app

ENV PII_MODEL_PATH "h2oai/deberta_finetuned_pii"

EXPOSE 8000
CMD ["uvicorn", "app:app", "--workers", "4", "--host", "0.0.0.0", "--port", "8000", "--log-config", "/common/log_conf.yaml"]

# gunicorn main:app --workers 4 --worker-class uvicorn.workers.UvicornWorker --bind 0.0.0.0:8000
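With detectors/ as the build context (the Dockerfile copies ./common and ./pii_transformer relative to it), a plausible build-and-run sequence, with an illustrative image name, is: docker build -f detectors/Dockerfile.pii-transformer -t pii-detector detectors, followed by docker run -p 8000:8000 pii-detector.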

detectors/pii_transformer/__init__.py

Whitespace-only changes.

detectors/pii_transformer/app.py

Lines changed: 46 additions & 0 deletions
import os
import sys
from contextlib import asynccontextmanager
from typing import Annotated

from fastapi import Header

sys.path.insert(0, os.path.abspath(".."))

from common.app import DetectorBaseAPI as FastAPI
from detector import Detector
from scheme import (
    ContentAnalysisHttpRequest,
    ContentsAnalysisResponse,
    Error,
)

detector_objects = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    detector_objects["detector"] = Detector()
    yield
    # Clean up the ML models and release the resources
    detector_objects.clear()


app = FastAPI(lifespan=lifespan, dependencies=[])


@app.post(
    "/api/v1/text/contents",
    response_model=ContentsAnalysisResponse,
    description="""Detectors that work on content text, be it user prompt or generated text. \
Generally, classification-type detectors qualify for this. <br>""",
    responses={
        404: {"model": Error, "description": "Resource Not Found"},
        422: {"model": Error, "description": "Validation Error"},
    },
)
async def detector_unary_handler(
    request: ContentAnalysisHttpRequest,
    detector_id: Annotated[str, Header(example="en_syntax_slate.38m.hap")],
):
    return ContentsAnalysisResponse(root=detector_objects["detector"].run(request))
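For reference, a minimal client call against this endpoint could look like the sketch below; the host and port follow the EXPOSE/CMD in the Dockerfile, and the detector-id header mirrors the locust test further down, so both are assumptions rather than anything fixed by this commit.

import requests

# Sketch of a unary detection call. Host/port and the detector-id value are
# illustrative; the payload shape matches ContentAnalysisHttpRequest.
resp = requests.post(
    "http://localhost:8000/api/v1/text/contents",
    json={
        "contents": [
            "My name is John Doe and my social security number is 123-45-6789."
        ]
    },
    headers={"detector-id": "pii", "Content-Type": "application/json"},
)
resp.raise_for_status()
# The response body is one list of analyses per input text (List[List[...]]).
for analyses in resp.json():
    for analysis in analyses:
        print(analysis["detection"], analysis["pii_check"])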

detectors/pii_transformer/detector.py

Lines changed: 92 additions & 0 deletions
import os
import sys

sys.path.insert(0, os.path.abspath(".."))
# from common.scheme import TextDetectionHttpRequest, TextDetectionResponse

import torch
from common.app import logger
from scheme import (
    ContentAnalysisHttpRequest,
    ContentAnalysisResponse,
    ContentsAnalysisResponse,
)

# Detector imports
from transformers import AutoTokenizer, AutoModelForTokenClassification


class Detector:
    def __init__(self):
        # Initialize the detector
        model_files_path = os.environ.get("PII_MODEL_PATH")
        logger.info(model_files_path)
        # The tokenizer processes the data on the CPU
        self.tokenizer = AutoTokenizer.from_pretrained(model_files_path, use_fast=True)
        self.model = AutoModelForTokenClassification.from_pretrained(
            pretrained_model_name_or_path=model_files_path,
        )

        logger.info("TORCH.CUDA " + str(torch.cuda.is_available()))

        self.cuda_device = None

        if torch.cuda.is_available():
            # Transparently take a CUDA GPU for this actor
            self.cuda_device = torch.device("cuda")
            torch.cuda.empty_cache()
            self.model.to(self.cuda_device)
            # self.tokenizer.to(self.cuda_device)
            # AttributeError: 'RobertaTokenizerFast' object has no attribute 'to'
            os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
        logger.info("CUDA_DEVICE " + str(self.cuda_device))

    def run(self, input: ContentAnalysisHttpRequest) -> ContentsAnalysisResponse:
        # Run the token classification for each entry in the contents array
        contents_analyses = []
        for text in input.contents:
            content_analyses = []
            tokenized = self.tokenizer(
                text,
                # max_length counts tokens, not characters; cap it at the
                # model's limit so long inputs cannot overrun the position
                # embeddings.
                max_length=self.tokenizer.model_max_length,
                return_tensors="pt",
                truncation=True,
                padding=True,
            )
            if self.cuda_device:
                logger.info("adding tokenized to CUDA")
                # If we are using a GPU, the tokens need to be there too.
                tokenized = tokenized.to(self.cuda_device)

            # A BatchEncoding includes 'data', 'encodings', 'is_fast', and 'n_sequences'.
            with torch.no_grad():
                model_out = self.model(**tokenized)
            logits = model_out.logits
            # Take the class with the highest score for each token and use the
            # model's id2label mapping to convert it to a list of text labels.
            predictions = torch.argmax(logits, dim=2)
            predicted_token_class = [
                self.model.config.id2label[p] for p in predictions[0].tolist()
            ]
            # If the predicted token classes contain any element other than 'O',
            # the text contains PII.
            pii_indicator = any(p != "O" for p in predicted_token_class)

            # # A List[float] seems like a sensible way to return this
            # if hap_score >= input.parameters["threshold"]:
            content_analyses.append(
                ContentAnalysisResponse(
                    start=0,
                    end=len(text),
                    detection="has_pii",
                    detection_type="pii",
                    pii_check=pii_indicator,
                    text=text,
                    predicted_token_class=predicted_token_class,
                    evidences=[],
                )
            )
            contents_analyses.append(content_analyses)

        return contents_analyses
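Note that each ContentAnalysisResponse currently spans the whole input (start=0, end=len(text)). If per-entity character spans are wanted later, the fast tokenizer can already supply them via its offset mapping; a minimal sketch, assuming the same tokenizer and model loaded above (the helper name is hypothetical and not part of this commit):

import torch

def pii_spans(tokenizer, model, text):
    # Recover (start, end, label) character spans for non-'O' tokens using the
    # fast tokenizer's offset mapping. Sketch only.
    enc = tokenizer(
        text, return_tensors="pt", truncation=True, return_offsets_mapping=True
    )
    offsets = enc.pop("offset_mapping")[0].tolist()
    with torch.no_grad():
        logits = model(**enc).logits
    labels = [model.config.id2label[p] for p in logits.argmax(dim=2)[0].tolist()]
    # Special tokens carry empty (0, 0) spans; skip them along with 'O' labels.
    return [
        (start, end, label)
        for (start, end), label in zip(offsets, labels)
        if label != "O" and start != end
    ]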
Lines changed: 35 additions & 0 deletions
"""
Content Warning: Contains potentially offensive text dealing with racism, misogyny, and violence. Examples of input prompts are provided purely for the purposes of testing HAP (Hate, Abuse and Profanity) models.
"""

from locust import HttpUser, between, task


class WebsiteUser(HttpUser):
    wait_time = between(1, 5)

    # def on_start(self):
    #     self.client.post("/login", {
    #         "username": "test_user",
    #         "password": ""
    #     })

    @task
    def docs(self):
        self.client.get("/docs")

    @task
    def api(self):
        self.client.get("/openapi.json")

    @task
    def pii(self):
        self.client.post(
            "/api/v1/text/contents?pii_transformer",
            json={
                "contents": [
                    "My name is John Doe and my social security number is 123-45-6789."
                ]
            },
            headers={"detector-id": "pii", "Content-Type": "application/json"},
        )
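A typical invocation, assuming this file is saved as locustfile.py (the diff omits the file name), would be: locust -f locustfile.py --host http://localhost:8000.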
detectors/pii_transformer/requirements.txt

Lines changed: 1 addition & 0 deletions

transformers==4.43.4

detectors/pii_transformer/scheme.py

Lines changed: 64 additions & 0 deletions
from enum import Enum
from typing import List, Optional

from pydantic import BaseModel, Field, RootModel


class Evidence(BaseModel):
    source: str = Field(
        title="Source",
        example="https://en.wikipedia.org/wiki/IBM",
        description="Source of the evidence; it can be, e.g., the URL of the evidence",
    )


class EvidenceType(str, Enum):
    url = "url"
    title = "title"


class EvidenceObj(BaseModel):
    type: EvidenceType = Field(
        title="EvidenceType",
        example="url",
        description="Type field signifying the type of evidence provided, e.g. url or title",
    )
    evidence: Evidence = Field(
        description="Evidence object; currently only contains source, but may carry other optional fields such as an id in the future",
    )


class ContentAnalysisHttpRequest(BaseModel):
    contents: List[str] = Field(
        min_length=1,
        title="Contents",
        description="Field allowing users to provide a list of texts for analysis. Note: the results of this endpoint contain the analysis / detection of each provided text, in the order they appear in the contents field.",
        example=[
            "Martians are like crocodiles; the more you give them meat, the more they want"
        ],
    )


class ContentAnalysisResponse(BaseModel):
    start: int = Field(example=14)
    end: int = Field(example=26)
    detection: str = Field(example="has_pii")
    detection_type: str = Field(example="pii")
    pii_check: bool = Field(example=True)
    text: str = Field(example="My favourite dish is pierogi")
    predicted_token_class: List[str] = Field(examples=["O", "O", "O", "O", "O"])
    evidences: Optional[List[EvidenceObj]] = Field(
        description="Optional field providing evidence for the detection",
        default=[],
    )


class ContentsAnalysisResponse(RootModel):
    root: List[List[ContentAnalysisResponse]] = Field(
        title="Response Text Content Analysis Unary Handler Api V1 Text Content Post"
    )


class Error(BaseModel):
    code: int
    message: str
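To illustrate how these models nest, here is a small sketch that builds and serializes the double-nested response the handler returns; the field values are made up, not taken from a real model run.

# Sketch: one inner list per input text, mirroring the order of `contents`.
analysis = ContentAnalysisResponse(
    start=0,
    end=65,  # end = len(text), as detector.run sets it
    detection="has_pii",
    detection_type="pii",
    pii_check=True,
    text="My name is John Doe and my social security number is 123-45-6789.",
    predicted_token_class=["O", "O", "O", "B-NAME", "I-NAME", "O"],  # labels illustrative
    evidences=[],
)
response = ContentsAnalysisResponse(root=[[analysis]])
print(response.model_dump_json(indent=2))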
