Commits (34)
e763ce0
build(requirements.txt): add initial requirements.txt
YazanShannak Jul 30, 2023
5a50078
feat(Nuha): implement Nuha model inference
YazanShannak Jul 30, 2023
93ed42f
feat(main.py): integrate Nuha in the predict endpoint
YazanShannak Jul 30, 2023
bc5fd2a
fix(main.py): login to huggingface hub
YazanShannak Jul 30, 2023
63791cd
build(Dockerfile): write initial Dockerfile
YazanShannak Jul 30, 2023
c091c72
docs(README.md): Update README.md
YazanShannak Jul 30, 2023
32ce490
feat: added original post and comments to the response
mbaraa Aug 7, 2023
1f0de5d
chore: removed unnecessary async from the request handler
mbaraa Aug 7, 2023
353327f
Merge pull request #2 from jordanopensource/feat/map-post-and-comment…
thamudi Aug 7, 2023
f1a1197
perf(requirements.txt): remove CUDA dependencies
YazanShannak Aug 9, 2023
490b839
refactor(src/model.py): remove unnecessary print
YazanShannak Aug 9, 2023
49402fe
build(Dockerfile): Fix some issues in the Dockerfile
YazanShannak Aug 9, 2023
3d72b66
refactor(main.py-src/model.py): Refactor model output and response to…
YazanShannak Aug 9, 2023
957a882
Merge pull request #3 from jordanopensource/fix/reduce-deps
thamudi Aug 9, 2023
8ee9353
Merge pull request #4 from jordanopensource/fix/cleanup-print
thamudi Aug 9, 2023
66ef60c
Merge pull request #5 from jordanopensource/fix/Dockerfile
thamudi Aug 9, 2023
59eebd4
Merge branch 'development' into refactor/model-response
thamudi Aug 9, 2023
1968f9b
Merge pull request #6 from jordanopensource/refactor/model-response
thamudi Aug 9, 2023
5709268
build(requirements.txt): Add extra index for torch-cpu
YazanShannak Aug 9, 2023
ef6cbeb
refactor: optimize docker image from 3G to 1.4G
thamudi Aug 13, 2023
778a977
build: add drone file
thamudi Aug 13, 2023
19d67b1
feat: add healthcheck endpoint
thamudi Aug 13, 2023
2f0776c
update critical dependencies
mbaraa Nov 30, 2023
a62d9e2
update less critical dependencies
mbaraa Nov 30, 2023
4795379
Add Multiclass api capabilities
YazanShannak Jan 21, 2024
781af62
Merge pull request #9 from jordanopensource/feature/multi-class
thamudi Jan 21, 2024
5e872e8
ci(.drone.yml): use the container jsonnet template
itsmohmans Sep 2, 2024
0947688
Merge pull request #12 from jordanopensource/ci/update-drone-template
itsmohmans Sep 3, 2024
e251f65
Add GitHub Actions workflow to schedule milestones weekly (#14)
thamudi Mar 9, 2025
fcce9c9
Merge branch 'main' into development
evilmooncake May 11, 2025
c55790d
chore: remove old drone file
thamudi Jul 31, 2025
6b01771
builds: add new wp builds file
thamudi Jul 31, 2025
6cd971e
ci: update pipeline build args
thamudi Sep 30, 2025
4a0acc8
add missing CI_PIPELINE_NUMBER
thamudi Sep 30, 2025
5 changes: 5 additions & 0 deletions .dockerignore
@@ -0,0 +1,5 @@
venv
.*
*.md
LICENSE
__pycache__
9 changes: 9 additions & 0 deletions .drone.yml
@@ -0,0 +1,9 @@
# Drone CI File!

kind: template
load: container.jsonnet
data:
repositoryName: josaorg/nuha-api
releaseName: nuha-api
buildArgs:

24 changes: 24 additions & 0 deletions .github/workflows/schedule-milestones.yaml
@@ -0,0 +1,24 @@
name: schedule-milestones

on:
schedule:
- cron: 0 0 * * SUN # Run every Sunday at midnight

jobs:
generate:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2

- name: Schedule Milestones
uses: readmeio/[email protected]
id: scheduled
with:
token: ${{ secrets.GITHUB_TOKEN }}
title: 'S-'
days: Thursday
count: 4
format: YYYY-MM-DD

- name: Created Milestones
run: echo ${{ steps.scheduled.outputs.milestones }}
Comment on lines +9 to +24

Check warning (Code scanning / CodeQL): Workflow does not contain permissions (Medium)

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {contents: read}

Copilot Autofix (6 months ago)

To fix the issue, add a permissions block to the workflow. Since the workflow uses the GITHUB_TOKEN to create milestones, it requires contents: read (to read repository contents) and issues: write (to create milestones). Defining these permissions explicitly at the job level ensures the workflow has only the necessary access.

Suggested changeset 1: .github/workflows/schedule-milestones.yaml. Run the following command in your local git repository to apply this patch:

cat << 'EOF' | git apply
diff --git a/.github/workflows/schedule-milestones.yaml b/.github/workflows/schedule-milestones.yaml
--- a/.github/workflows/schedule-milestones.yaml
+++ b/.github/workflows/schedule-milestones.yaml
@@ -8,2 +8,5 @@
   generate:
+    permissions:
+      contents: read
+      issues: write
     runs-on: ubuntu-latest
EOF
21 changes: 21 additions & 0 deletions Dockerfile
@@ -0,0 +1,21 @@
FROM python:3.10.6-slim AS builder

WORKDIR /app

RUN pip install --upgrade pip
ADD requirements.txt /tmp
RUN pip install -r /tmp/requirements.txt
COPY . /app


# Run stage
FROM python:3.10.6-slim

WORKDIR /app

COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY --from=builder /usr/local/bin/ /usr/local/bin/
COPY --from=builder /app .

ENTRYPOINT [ "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000" ]

50 changes: 47 additions & 3 deletions README.md
@@ -63,6 +63,11 @@ To get a local copy up and running follow these simple steps.

### Prerequisites

This project depends on a trained text classification model hosted on [Hugging Face](https://huggingface.co/). You can either train your own model or use the one provided by JOSA. The model is configured through environment variables, which are passed to the application at runtime (see the example after the list):
1. HUGGINGFACE_TOKEN: The Hugging Face API token.
2. MODEL_PATH: The model path on Hugging Face.
3. MODEL_VERSION: The model version on Hugging Face.
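
For example, you can export these in your shell before running the app. This is a minimal sketch; the values below are placeholders, not real credentials or model coordinates:

```sh
# Placeholder values, substitute your own token and model coordinates
export HUGGINGFACE_TOKEN="hf_your_token_here"
export MODEL_PATH="your-org/your-model"   # model repository on Hugging Face
export MODEL_VERSION="main"               # branch, tag, or commit of that repository
```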

### Installation

1. Clone the repo
@@ -71,21 +76,60 @@ To get a local copy up and running follow these simple steps.
git clone https://github.com/jordanopensource/nuha-api.git
```

2. Create a virtual environment

```sh
python3 -m venv venv
```

3. Activate the virtual environment

```sh
source venv/bin/activate
```

4. Install the dependencies

```sh
pip install -r requirements.txt
```



### Running

#### Development

To run the project locally for development purposes:

1. Activate the virtual environment

```sh
source venv/bin/activate
```

2. Run the project

```sh
HUGGINGFACE_TOKEN="" MODEL_PATH="" MODEL_VERSION="" uvicorn main:app --reload
```

#### Production

To build and run the project locally for production purposes:

1. Build the Docker image

```sh
docker build -t nuha-api .
```

2. Run the Docker container

```sh
docker run -d -p 8000:8000 -e HUGGINGFACE_TOKEN="" -e MODEL_PATH="" -e MODEL_VERSION="" nuha-api
```



___

49 changes: 49 additions & 0 deletions main.py
@@ -0,0 +1,49 @@
"""Nuha API main module."""
import os
from fastapi import FastAPI
from fastapi.requests import Request
import huggingface_hub

from src.interface import PredictionRequest, PredictionResponse
from src.model import Nuha, PredictionResult

app = FastAPI(
title="Nuha API",
description="API to serve ML model for hate-speech classification",
)


@app.on_event("startup")
def on_startup():
"""Load model on startup."""
model_path = os.environ.get("MODEL_PATH")
model_version = os.environ.get("MODEL_VERSION")
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")

huggingface_hub.login(token=huggingface_token)
app.state.model = Nuha(model_path=model_path, model_version=model_version)

@app.get("/healthcheck")
def healthcheck(request: Request):
    """Health check endpoint."""
    return "A healthy response"

@app.post("/predict")
def predict(
request: Request, comments: list[PredictionRequest]
) -> list[PredictionResponse]:
"""Classify comments into hatespeech or not."""
model = request.app.state.model

results: list[PredictionResult]
results = model.predict([c.comment for c in comments])

return [
{
"label": result.label,
"score": result.score,
"model_version": model.model_version,
"comment": comment.comment,
"post": comment.post,
}
for result, comment in zip(results, comments)
]
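
For context, here is a sketch of how the two endpoints above can be exercised once the server is running. The port follows the uvicorn defaults used elsewhere in this PR; the response shown is illustrative, not captured output:

```sh
# Liveness probe
curl http://localhost:8000/healthcheck

# Classify a batch of comments; the "post" field is optional
curl -X POST http://localhost:8000/predict \
  -H "Content-Type: application/json" \
  -d '[{"comment": "some comment text", "post": "the original post"}]'

# Illustrative response shape:
# [{"label": "not-online-violence", "score": 0.97, "model_version": "main",
#   "comment": "some comment text", "post": "the original post"}]
```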
67 changes: 67 additions & 0 deletions requirements.txt
@@ -0,0 +1,67 @@
--extra-index-url https://download.pytorch.org/whl/cpu
aiohttp==3.8.6
aiosignal==1.3.1
annotated-types==0.5.0
anyio==3.7.1
async-timeout==4.0.2
attrs==23.1.0
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
coloredlogs==15.0.1
datasets==2.14.4
dill==0.3.7
evaluate==0.4.0
exceptiongroup==1.1.2
fastapi==0.101.0
filelock==3.9.0
flatbuffers==23.5.26
frozenlist==1.4.0
fsspec==2023.6.0
h11==0.14.0
httptools==0.6.0
huggingface-hub==0.16.4
humanfriendly==10.0
idna==3.4
Jinja2==3.1.2
MarkupSafe==2.1.2
mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.15
networkx==3.0
numpy==1.25.2
onnx==1.14.0
onnxruntime==1.15.1
optimum==1.11.0
packaging==23.1
pandas==2.0.3
protobuf==4.24.0
pyarrow==14.0.1
pydantic==2.1.1
pydantic_core==2.4.0
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
PyYAML==6.0.1
regex==2023.8.8
requests==2.31.0
responses==0.18.0
safetensors==0.3.2
sentencepiece==0.1.99
six==1.16.0
sniffio==1.3.0
starlette==0.27.0
sympy==1.11.1
tokenizers==0.13.3
torch==2.0.1+cpu
tqdm==4.65.2
transformers==4.31.0
typing_extensions==4.7.1
tzdata==2023.3
urllib3==2.0.7
uvicorn==0.23.2
uvloop==0.17.0
watchfiles==0.19.0
websockets==11.0.3
xxhash==3.3.0
yarl==1.9.2
Empty file added src/__init__.py
Empty file.
21 changes: 21 additions & 0 deletions src/interface.py
@@ -0,0 +1,21 @@
"""Interface type definitions."""

from typing import Literal, Optional
from pydantic import BaseModel # pylint:disable=E0611


class PredictionRequest(BaseModel):
"""Single instance of comment to predict."""

comment: str
    post: Optional[str] = None  # original post the comment belongs to, if any


class PredictionResponse(BaseModel):
"""Single instance of comment prediction"""

label: Literal["offensive-language", "not-online-violence", "gender-based-violence"]
score: float
model_version: str
comment: str
    post: Optional[str] = None  # echoed back from the request; None when not provided
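
A quick sketch of how these models behave under pydantic v2, the version pinned in requirements.txt (the payload values here are made up):

```python
from src.interface import PredictionRequest

# "post" may be omitted from the request body; it then defaults to None
req = PredictionRequest.model_validate({"comment": "some comment text"})
print(req.comment, req.post)  # -> some comment text None
```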
54 changes: 54 additions & 0 deletions src/model.py
@@ -0,0 +1,54 @@
"""Model for Nuha."""
from dataclasses import dataclass
from typing import Literal
from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.pipelines import pipeline
from transformers import AutoTokenizer


@dataclass
class PredictionResult:
"""Model prediction result."""

label: Literal["offensive-language", "not-online-violence", "gender-based-violence"]
score: float


class Nuha:
"""Encapsulator for Nuha."""

BATCH_SIZE = 32

def __init__(self, model_path: str, model_version: str) -> None:
self.model_path = model_path
self.model_version = model_version
self.device = "cpu"
self.tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_path, revision=model_version
)
self.model = ORTModelForSequenceClassification.from_pretrained(
model_id=model_path, revision=model_version
)

self.classifier = pipeline(
task="text-classification",
model=self.model,
accelerator="ort",
tokenizer=self.tokenizer,
device=self.device,
)

def predict(self, batch: list[str]) -> list[PredictionResult]:
"""Run model inference on a batch of comments.

Returns:
list[PredictionResult]: list of labels and scores for each comment
"""
        output = self.classifier(batch, batch_size=self.BATCH_SIZE)
return [
PredictionResult(
label=o["label"].lower().replace(" ", "-"), score=o["score"]
)
for o in output
]
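
A minimal usage sketch for this class. The model path and version below are hypothetical placeholders; a private model repository would additionally require huggingface_hub.login(), as main.py does on startup:

```python
from src.model import Nuha

# Hypothetical model coordinates, substitute a real Hugging Face repository
model = Nuha(model_path="your-org/your-model", model_version="main")

for result in model.predict(["first comment", "second comment"]):
    print(result.label, result.score)
```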