david-thrower
diff --git a/‎Dockerfile‎
Lines changed: 20 additions & 0 deletions b/‎Dockerfile‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎README-tokenize-first.md‎
Lines changed: 70 additions & 0 deletions b/‎README-tokenize-first.md‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎data/train_tokens.npz‎
3.81 MB b/‎data/train_tokens.npz‎
3.81 MB
diff --git a/‎helm/cerebros-nlp-poc/Chart.yaml‎
Lines changed: 6 additions & 0 deletions b/‎helm/cerebros-nlp-poc/Chart.yaml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎helm/cerebros-nlp-poc/templates/jobs.yaml‎
Lines changed: 78 additions & 0 deletions b/‎helm/cerebros-nlp-poc/templates/jobs.yaml‎
Lines changed: 78 additions & 0 deletions
diff --git a/‎helm/cerebros-nlp-poc/values.yaml‎
Lines changed: 29 additions & 0 deletions b/‎helm/cerebros-nlp-poc/values.yaml‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎helm/mlflow-tracking/Chart.yaml‎
Lines changed: 6 additions & 0 deletions b/‎helm/mlflow-tracking/Chart.yaml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎helm/mlflow-tracking/templates/_helpers.tpl‎
Lines changed: 12 additions & 0 deletions b/‎helm/mlflow-tracking/templates/_helpers.tpl‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎helm/mlflow-tracking/templates/mlflow.yaml‎
Lines changed: 76 additions & 0 deletions b/‎helm/mlflow-tracking/templates/mlflow.yaml‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎helm/mlflow-tracking/templates/postgres.yaml‎
Lines changed: 59 additions & 0 deletions b/‎helm/mlflow-tracking/templates/postgres.yaml‎
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,20 @@
+FROM tensorflow/tensorflow:2.19.0-gpu
+
+WORKDIR /app
+
+# Keep pip quiet and cache-free (smaller layers), and pin the Hugging Face
+# cache to a fixed path so it can be bind-mounted from the host at run-time.
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    HF_HOME=/root/.cache/huggingface
+
+# Python deps: copy only the requirement manifests first so this expensive
+# layer is reused from the build cache until one of the manifests changes.
+# NOTE(review): assumes the manifests do not reference other local files.
+# NOTE(review): mlflow is installed unpinned; consider pinning it to the
+# tracking server's version for reproducible builds.
+COPY requirements.txt cicd-requirements.txt ./
+RUN python -m pip install --upgrade pip \
+ && python -m pip install -r requirements.txt \
+ && python -m pip install -r cicd-requirements.txt \
+ && python -m pip install mlflow
+
+# Application source goes last: code edits do not invalidate the dependency layer.
+COPY . /app
+
+# Default: show script help; supply args at run-time
+CMD ["python", "phishing_email_detection_gpt2.py", "--help"]
@@ -0,0 +1,70 @@
+# Tokenize-first Phishing Email Classification
+
+This doc explains the new prepare → train workflow for `phishing_email_detection_gpt2.py`.
+
+## Prepare tokens
+
+The input can be any of: a CSV with `Email Text` and `Email Type` columns (the type is mapped to a label), a generic CSV with `text` and `label` columns, or a JSONL file with one `{"text": ..., "label": 0|1}` object per line.
+
+Example (small, CPU-safe):
+
+```bash
+python phishing_email_detection_gpt2.py --mode prepare \
+  --in Phishing_Email.csv \
+  --out data/train_tokens.npz \
+  --max_len 128 \
+  --tokenizer_checkpoint HuggingFaceTB/SmolLM3-3B
+```
+
+## Train from cache
+
+```bash
+python phishing_email_detection_gpt2.py --mode train \
+  --cache data/train_tokens.npz \
+  --epochs 1 --batch 8 --print-score-only
+```
+
+If `MLFLOW_TRACKING_URI` is set, params/metrics and the model artifact are logged to MLflow.
+
+## Docker (GPU-ready)
+
+```bash
+# Build
+docker build -t thunder/poc:tf2.19 .
+
+# Prepare
+docker run --rm -it -v "$PWD":/app --gpus all \
+  thunder/poc:tf2.19 \
+  python phishing_email_detection_gpt2.py --mode prepare \
+  --in Phishing_Email.csv --out data/train_tokens.npz --max_len 128 \
+  --tokenizer_checkpoint HuggingFaceTB/SmolLM3-3B
+
+# Train
+docker run --rm -it -v "$PWD":/app --gpus all \
+  thunder/poc:tf2.19 \
+  python phishing_email_detection_gpt2.py --mode train \
+  --cache data/train_tokens.npz --epochs 1 --batch 8 --print-score-only
+```
+
+Speed tip: mount your Hugging Face cache into the container so tokenizer files are not downloaded again on every run: `-v "$HOME/.cache/huggingface":/root/.cache/huggingface`.
+
+## MLflow backed by Postgres (docker-compose)
+
+Spin up Postgres + MLflow locally (persistent volumes included):
+
+```bash
+cd infra/mlflow-postgres
+docker compose up -d --build
+```
+
+Point your client at the tracking server, then run the prepare/train commands above in the same shell:
+
+```bash
+export MLFLOW_TRACKING_URI=http://127.0.0.1:5000
+```
+
+Stop the stack when done (run this from `infra/mlflow-postgres`):
+
+```bash
+docker compose down
+```
@@ -0,0 +1,6 @@
+# Helm chart metadata for the tokenize-first NLP proof-of-concept Jobs.
+apiVersion: v2
+type: application
+name: cerebros-nlp-poc
+description: Tokenize-first NLP POC jobs (prepare and train) using shared PVC
+# appVersion describes the packaged application; version is the chart's own version.
+appVersion: "0.1.0"
+version: 0.1.0
@@ -0,0 +1,78 @@
+# Volume claim mounted by both the prepare and the train Job.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ .Values.data.pvcName }}
+spec:
+  # storageClassName is emitted only when a class is configured; otherwise the
+  # cluster default applies.
+  {{- with .Values.data.storageClass }}
+  storageClassName: {{ . | quote }}
+  {{- end }}
+  accessModes:
+    - "ReadWriteOnce"
+  resources:
+    requests:
+      storage: {{ .Values.data.size }}
+---
+{{- if .Values.prepare.enabled }}
+# One-shot Job that runs the script in "prepare" mode with the data PVC mounted.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: cerebros-prepare
+spec:
+  template:
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: prepare
+          image: {{ .Values.image.repository }}:{{ .Values.image.tag }}
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          env:
+            - name: MLFLOW_TRACKING_URI
+              value: {{ .Values.mlflow.trackingUri | quote }}
+          command: ["python", "phishing_email_detection_gpt2.py"]
+          # Every flag and every value is its own list item (one argv entry
+          # each). Values are quoted because container args must be strings.
+          args:
+            - "--mode"
+            - "prepare"
+            - "--in"
+            - {{ .Values.prepare.input | quote }}
+            - "--out"
+            - {{ .Values.prepare.output | quote }}
+            - "--max_len"
+            - {{ .Values.prepare.maxLen | quote }}
+            - "--tokenizer_checkpoint"
+            - {{ .Values.prepare.tokenizer | quote }}
+          # Rendered only when resources are configured in the chart values
+          # (for example GPU limits); omitted for the empty default.
+          {{- with .Values.resources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          volumeMounts:
+            - name: data
+              mountPath: {{ .Values.data.mountPath }}
+      volumes:
+        - name: data
+          persistentVolumeClaim:
+            claimName: {{ .Values.data.pvcName }}
+{{- end }}
+---
+{{- if .Values.train.enabled }}
+# One-shot Job that runs the script in "train" mode against the cached tokens.
+# NOTE(review): this Job is created together with cerebros-prepare and nothing
+# in this template orders the two, so training may start before the cache
+# exists - confirm how sequencing is intended (hooks, separate installs, ...).
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: cerebros-train
+spec:
+  template:
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: train
+          image: {{ .Values.image.repository }}:{{ .Values.image.tag }}
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          env:
+            - name: MLFLOW_TRACKING_URI
+              value: {{ .Values.mlflow.trackingUri | quote }}
+          command: ["python", "phishing_email_detection_gpt2.py"]
+          # Every flag and every value is its own list item (one argv entry
+          # each). Values are quoted because container args must be strings.
+          args:
+            - "--mode"
+            - "train"
+            - "--cache"
+            - {{ .Values.train.cache | quote }}
+            - "--epochs"
+            - {{ .Values.train.epochs | quote }}
+            - "--batch"
+            - {{ .Values.train.batch | quote }}
+            {{- if .Values.train.printScoreOnly }}
+            - "--print-score-only"
+            {{- end }}
+          # Rendered only when resources are configured in the chart values
+          # (for example GPU limits); omitted for the empty default.
+          {{- with .Values.resources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          volumeMounts:
+            - name: data
+              mountPath: {{ .Values.data.mountPath }}
+      volumes:
+        - name: data
+          persistentVolumeClaim:
+            claimName: {{ .Values.data.pvcName }}
+{{- end }}
@@ -0,0 +1,29 @@
+# Container image used by both the prepare and the train Job.
+image:
+  repository: thunder/poc
+  tag: tf2.19
+  pullPolicy: IfNotPresent
+
+# Passed to the Jobs as the MLFLOW_TRACKING_URI environment variable.
+# NOTE(review): the host part must equal the MLflow Service name in the
+# cluster, which depends on the release name - confirm for your install.
+mlflow:
+  trackingUri: "http://mlflow-tracking:5000"
+
+# Volume claim shared by the Jobs.
+data:
+  pvcName: cerebros-poc-data
+  # Mount the claim on the data sub-directory only. Mounting it at /app would
+  # hide the application code the image ships in /app, so the Jobs' command
+  # `python phishing_email_detection_gpt2.py` would not find the script.
+  mountPath: /app/data
+  # Empty string: no storageClassName is rendered.
+  storageClass: ""
+  size: 2Gi
+
+# Arguments for the prepare Job. Relative paths resolve against the
+# container's working directory.
+# NOTE(review): the input CSV is expected to be available in the container
+# (baked into the image or placed under data/) - confirm.
+prepare:
+  enabled: true
+  input: Phishing_Email.csv
+  output: data/train_tokens.npz
+  maxLen: 128
+  tokenizer: HuggingFaceTB/SmolLM3-3B
+
+# Arguments for the train Job; cache should point at the prepare output.
+train:
+  enabled: true
+  cache: data/train_tokens.npz
+  epochs: 1
+  batch: 8
+  printScoreOnly: true
+
+# Intended for container resource requests/limits; empty by default.
+resources: {}
@@ -0,0 +1,6 @@
+# Helm chart metadata for the MLflow tracking server and its Postgres backend.
+apiVersion: v2
+type: application
+name: mlflow-tracking
+description: MLflow Tracking Server backed by Postgres
+# appVersion describes the packaged application; version is the chart's own version.
+appVersion: "2.14.1"
+version: 0.1.0
@@ -0,0 +1,12 @@
+{{- /* Chart name, or nameOverride when set; cut to 63 characters with any trailing dash removed. */ -}}
+{{- define "mlflow-tracking.name" -}}
+{{- .Values.nameOverride | default .Chart.Name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{- /*
+Full resource name: fullnameOverride when set, otherwise the release name
+joined with the chart name (or nameOverride) by a dash. Either way the result
+is cut to 63 characters and any trailing dash is removed.
+*/ -}}
+{{- define "mlflow-tracking.fullname" -}}
+{{- with .Values.fullnameOverride -}}
+{{- . | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $base := .Values.nameOverride | default .Chart.Name -}}
+{{- list .Release.Name $base | join "-" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
@@ -0,0 +1,76 @@
+# Service in front of the MLflow pods; it forwards to the container port named "http".
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "mlflow-tracking.fullname" . }}
+spec:
+  type: {{ .Values.service.type }}
+  # Must match the pod labels set by the MLflow Deployment template.
+  selector:
+    app.kubernetes.io/component: mlflow
+    app.kubernetes.io/instance: {{ .Release.Name }}
+    app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
+  ports:
+    - name: http
+      port: {{ .Values.service.port }}
+      targetPort: http
+---
+# Single-replica MLflow tracking server: Postgres as backend store, artifacts
+# kept on the mounted "artifacts" volume.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "mlflow-tracking.fullname" . }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
+      app.kubernetes.io/instance: {{ .Release.Name }}
+      app.kubernetes.io/component: mlflow
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
+        app.kubernetes.io/instance: {{ .Release.Name }}
+        app.kubernetes.io/component: mlflow
+    spec:
+      containers:
+        - name: mlflow
+          image: {{ .Values.image.repository }}:{{ .Values.image.tag }}
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          ports:
+            - containerPort: {{ .Values.mlflow.port }}
+              name: http
+          env:
+            # Values are quoted so characters that are special in YAML cannot
+            # break the manifest.
+            # NOTE(review): the database password ends up in plain text in the
+            # pod spec; consider sourcing it from a Secret.
+            - name: BACKEND_STORE_URI
+              value: {{ printf "postgresql+psycopg2://%v:%v@%s-postgres:5432/%v" .Values.postgres.username .Values.postgres.password (include "mlflow-tracking.fullname" .) .Values.postgres.database | quote }}
+            - name: ARTIFACT_ROOT
+              value: {{ .Values.mlflow.defaultArtifactRoot | quote }}
+          command: ["mlflow", "server"]
+          # The artifact root is a local path on this pod's volume. Passing it
+          # as --artifacts-destination lets the server store uploads there on
+          # behalf of clients; as --default-artifact-root, clients in other
+          # pods would write to the same path on their own filesystem instead.
+          args:
+            - "--host"
+            - "{{ .Values.mlflow.host }}"
+            - "--port"
+            - "{{ .Values.mlflow.port }}"
+            - "--backend-store-uri"
+            - "$(BACKEND_STORE_URI)"
+            - "--artifacts-destination"
+            - "$(ARTIFACT_ROOT)"
+          volumeMounts:
+            - name: artifacts
+              mountPath: {{ .Values.mlflow.defaultArtifactRoot }}
+          resources: {{- toYaml .Values.resources | nindent 12 }}
+      volumes:
+        - name: artifacts
+          persistentVolumeClaim:
+            claimName: {{ include "mlflow-tracking.fullname" . }}-artifacts
+---
+# Volume claim backing the MLflow artifact directory.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ include "mlflow-tracking.fullname" . }}-artifacts
+spec:
+  # storageClassName is emitted only when a class is configured.
+  {{- with .Values.artifacts.storageClass }}
+  storageClassName: {{ . | quote }}
+  {{- end }}
+  accessModes:
+    - "ReadWriteOnce"
+  resources:
+    requests:
+      storage: {{ .Values.artifacts.size }}
@@ -0,0 +1,59 @@
+# Headless Service (clusterIP: None) for the Postgres pod; the StatefulSet
+# references it by name as its serviceName.
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "mlflow-tracking.fullname" . }}-postgres
+spec:
+  clusterIP: None
+  selector:
+    app.kubernetes.io/component: postgres
+    app.kubernetes.io/instance: {{ .Release.Name }}
+    app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
+  ports:
+    - name: postgres
+      port: 5432
+---
+# Single-replica Postgres used as the MLflow backend store; data lives on a
+# per-pod volume created from the claim template below.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: {{ include "mlflow-tracking.fullname" . }}-postgres
+spec:
+  serviceName: {{ include "mlflow-tracking.fullname" . }}-postgres
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
+      app.kubernetes.io/instance: {{ .Release.Name }}
+      app.kubernetes.io/component: postgres
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
+        app.kubernetes.io/instance: {{ .Release.Name }}
+        app.kubernetes.io/component: postgres
+    spec:
+      containers:
+        - name: postgres
+          image: {{ .Values.postgres.image }}
+          ports:
+            - containerPort: 5432
+          env:
+            # NOTE(review): credentials are rendered in plain text into the
+            # pod spec; consider sourcing them from a Secret.
+            - name: POSTGRES_USER
+              value: {{ .Values.postgres.username | quote }}
+            - name: POSTGRES_PASSWORD
+              value: {{ .Values.postgres.password | quote }}
+            - name: POSTGRES_DB
+              value: {{ .Values.postgres.database | quote }}
+            # Keep the database in a sub-directory of the mount: a freshly
+            # provisioned volume can contain lost+found, and initdb refuses to
+            # initialise a non-empty directory. PGDATA is the official
+            # postgres image's setting - confirm if a custom image is used.
+            - name: PGDATA
+              value: /var/lib/postgresql/data/pgdata
+          volumeMounts:
+            - name: pgdata
+              mountPath: /var/lib/postgresql/data
+  volumeClaimTemplates:
+    - metadata:
+        name: pgdata
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        # Prefer a Postgres-specific storage class when one is configured and
+        # fall back to the artifacts storage class otherwise.
+        {{- with (default .Values.artifacts.storageClass .Values.postgres.storage.storageClass) }}
+        storageClassName: {{ . | quote }}
+        {{- end }}
+        resources:
+          requests:
+            storage: {{ .Values.postgres.storage.size }}