Skip to content

Commit c4e4cf0

Browse files
author
mo
committed
K8s: harden jobs, MLflow; runner: support --input JSONL/CSV; add Helm charts for jobs+mlflow
1 parent abfc1fc commit c4e4cf0

File tree

17 files changed

+889
-0
lines changed

17 files changed

+889
-0
lines changed

Dockerfile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Image for phishing_email_detection_gpt2.py, built on the TensorFlow 2.19.0 GPU base.
FROM tensorflow/tensorflow:2.19.0-gpu

WORKDIR /app

# Speed up installs and set HF cache
ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    HF_HOME=/root/.cache/huggingface

# Python deps: copy only the requirement manifests first so this (slow) layer
# is rebuilt when the dependency lists change, not on every source edit.
# NOTE(review): assumes neither requirements file references other local
# paths (e.g. `-e .` or `-r sub/file.txt`) - confirm.
COPY requirements.txt cicd-requirements.txt /app/
RUN python -m pip install --upgrade pip \
    && pip install -r requirements.txt \
    && pip install -r cicd-requirements.txt \
    && pip install mlflow

# Copy current repo into image
COPY . /app

# Default: show script help; supply args at run-time
CMD ["python", "phishing_email_detection_gpt2.py", "--help"]

README-tokenize-first.md

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Tokenize-first Phishing Email Classification
2+
3+
This doc explains the new prepare → train workflow for `phishing_email_detection_gpt2.py`.
4+
5+
## Prepare tokens
6+
7+
Input can be one of: a CSV with columns `Email Text` and `Email Type` (mapped to labels), a generic CSV with columns `text` and `label`, or a JSONL file with one `{"text": ..., "label": 0|1}` object per line.
8+
9+
Example (small, CPU-safe):
10+
11+
```bash
12+
python phishing_email_detection_gpt2.py --mode prepare \
13+
--in Phishing_Email.csv \
14+
--out data/train_tokens.npz \
15+
--max_len 128 \
16+
--tokenizer_checkpoint HuggingFaceTB/SmolLM3-3B
17+
```
18+
19+
## Train from cache
20+
21+
```bash
22+
python phishing_email_detection_gpt2.py --mode train \
23+
--cache data/train_tokens.npz \
24+
--epochs 1 --batch 8 --print-score-only
25+
```
26+
27+
If `MLFLOW_TRACKING_URI` is set, params/metrics and the model artifact are logged to MLflow.
28+
29+
## Docker (GPU-ready)
30+
31+
```bash
32+
# Build
33+
docker build -t thunder/poc:tf2.19 .
34+
35+
# Prepare
36+
docker run --rm -it -v "$PWD":/app --gpus all \
37+
thunder/poc:tf2.19 \
38+
python phishing_email_detection_gpt2.py --mode prepare \
39+
--in Phishing_Email.csv --out data/train_tokens.npz --max_len 128 \
40+
--tokenizer_checkpoint HuggingFaceTB/SmolLM3-3B
41+
42+
# Train
43+
docker run --rm -it -v "$PWD":/app --gpus all \
44+
thunder/poc:tf2.19 \
45+
python phishing_email_detection_gpt2.py --mode train \
46+
--cache data/train_tokens.npz --epochs 1 --batch 8 --print-score-only
47+
```
48+
49+
Speed tip: mount your HF cache: `-v $HOME/.cache/huggingface:/root/.cache/huggingface`.
50+
51+
## MLflow backed by Postgres (docker-compose)
52+
53+
Spin up Postgres + MLflow locally (persistent volumes included):
54+
55+
```bash
56+
cd infra/mlflow-postgres
57+
docker compose up -d --build
58+
```
59+
60+
Set your client env and run the scripts:
61+
62+
```bash
63+
export MLFLOW_TRACKING_URI=http://127.0.0.1:5000
64+
```
65+
66+
Stop the stack when done (run from the same `infra/mlflow-postgres` directory):
67+
68+
```bash
69+
docker compose down
70+
```

data/train_tokens.npz

3.81 MB
Binary file not shown.

helm/cerebros-nlp-poc/Chart.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Helm chart metadata for the tokenize-first NLP POC jobs.
apiVersion: v2
name: cerebros-nlp-poc
type: application
description: >-
  Tokenize-first NLP POC jobs (prepare and train) using shared PVC
# Chart version, followed by the version of the application it packages.
version: "0.1.0"
appVersion: "0.1.0"
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Data volume claim; the prepare and train Jobs in this template both mount it
# through .Values.data.pvcName.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ .Values.data.pvcName }}
spec:
  accessModes:
    - "ReadWriteOnce"
  # storageClassName is emitted only when data.storageClass is non-empty.
  {{- with .Values.data.storageClass }}
  storageClassName: {{ . | quote }}
  {{- end }}
  resources:
    requests:
      storage: {{ .Values.data.size }}
13+
---
14+
{{- if .Values.prepare.enabled }}
# Job that runs phishing_email_detection_gpt2.py in "prepare" mode.
# Rendered only when prepare.enabled is true.
apiVersion: batch/v1
kind: Job
metadata:
  name: cerebros-prepare
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: prepare
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          env:
            - name: MLFLOW_TRACKING_URI
              value: {{ .Values.mlflow.trackingUri | quote }}
          command: ["python", "phishing_email_detection_gpt2.py"]
          # Every flag and every value is its own list item: each item becomes
          # one argv entry, and `"--flag"; "value"` on one line is not valid YAML.
          # Values are quoted so numbers such as maxLen are passed as strings.
          args:
            - "--mode"
            - "prepare"
            - "--in"
            - {{ .Values.prepare.input | quote }}
            - "--out"
            - {{ .Values.prepare.output | quote }}
            - "--max_len"
            - {{ .Values.prepare.maxLen | quote }}
            - "--tokenizer_checkpoint"
            - {{ .Values.prepare.tokenizer | quote }}
          volumeMounts:
            - name: data
              mountPath: {{ .Values.data.mountPath }}
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: {{ .Values.data.pvcName }}
{{- end }}
45+
---
46+
{{- if .Values.train.enabled }}
# Job that runs phishing_email_detection_gpt2.py in "train" mode against the
# token cache. Rendered only when train.enabled is true.
apiVersion: batch/v1
kind: Job
metadata:
  name: cerebros-train
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: train
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          env:
            - name: MLFLOW_TRACKING_URI
              value: {{ .Values.mlflow.trackingUri | quote }}
          command: ["python", "phishing_email_detection_gpt2.py"]
          # Every flag and every value is its own list item: each item becomes
          # one argv entry, and `"--flag"; "value"` on one line is not valid YAML.
          # Values are quoted so numbers such as epochs/batch are passed as strings.
          args:
            - "--mode"
            - "train"
            - "--cache"
            - {{ .Values.train.cache | quote }}
            - "--epochs"
            - {{ .Values.train.epochs | quote }}
            - "--batch"
            - {{ .Values.train.batch | quote }}
            {{- if .Values.train.printScoreOnly }}
            - "--print-score-only"
            {{- end }}
          volumeMounts:
            - name: data
              mountPath: {{ .Values.data.mountPath }}
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: {{ .Values.data.pvcName }}
{{- end }}

helm/cerebros-nlp-poc/values.yaml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Container image run by the prepare and train Jobs.
image:
  repository: thunder/poc
  tag: tf2.19
  pullPolicy: IfNotPresent

# Exported to the Jobs as MLFLOW_TRACKING_URI.
mlflow:
  trackingUri: "http://mlflow-tracking:5000"

# PersistentVolumeClaim mounted into both Jobs.
data:
  pvcName: cerebros-poc-data
  # Mount below the image WORKDIR (/app) rather than on top of it: the image
  # copies the repo, including phishing_email_detection_gpt2.py, into /app, and
  # a volume mounted at /app would hide that script from the Jobs' command.
  # The relative data/... paths below resolve into this mount.
  mountPath: /app/data
  # Empty string: no storageClassName is rendered on the PVC.
  storageClass: ""
  size: 2Gi

# Arguments for the "prepare" Job (--in, --out, --max_len, --tokenizer_checkpoint).
prepare:
  enabled: true
  input: Phishing_Email.csv
  output: data/train_tokens.npz
  maxLen: 128
  tokenizer: HuggingFaceTB/SmolLM3-3B

# Arguments for the "train" Job (--cache, --epochs, --batch, --print-score-only).
train:
  enabled: true
  cache: data/train_tokens.npz
  epochs: 1
  batch: 8
  printScoreOnly: true

# NOTE(review): not referenced by the Job templates visible in this chart - confirm.
resources: {}

helm/mlflow-tracking/Chart.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Helm chart metadata for the MLflow tracking server.
apiVersion: v2
name: mlflow-tracking
type: application
description: >-
  MLflow Tracking Server backed by Postgres
# Chart version, followed by the version of the application it packages.
version: "0.1.0"
appVersion: "2.14.1"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{{- /*
mlflow-tracking.name: the chart name, or .Values.nameOverride when set,
truncated to 63 characters with any trailing "-" removed.
*/ -}}
{{- define "mlflow-tracking.name" -}}
{{- .Values.nameOverride | default .Chart.Name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
4+
5+
{{- /*
mlflow-tracking.fullname: .Values.fullnameOverride when set; otherwise
"<release name>-<chart name or nameOverride>". Either form is truncated to
63 characters with any trailing "-" removed.
*/ -}}
{{- define "mlflow-tracking.fullname" -}}
{{- with .Values.fullnameOverride -}}
{{- . | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name (.Values.nameOverride | default .Chart.Name) | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Service in front of the MLflow pods; forwards service.port to the
# container port named "http".
apiVersion: v1
kind: Service
metadata:
  name: {{ include "mlflow-tracking.fullname" . }}
spec:
  type: {{ .Values.service.type }}
  selector:
    app.kubernetes.io/component: mlflow
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
  ports:
    - name: http
      port: {{ .Values.service.port }}
      targetPort: http
15+
---
16+
# Single-replica Deployment running `mlflow server`, pointed at the chart's
# "<fullname>-postgres" service and storing artifacts on the "<fullname>-artifacts" PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "mlflow-tracking.fullname" . }}
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
      app.kubernetes.io/instance: {{ .Release.Name }}
      app.kubernetes.io/component: mlflow
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
        app.kubernetes.io/instance: {{ .Release.Name }}
        app.kubernetes.io/component: mlflow
    spec:
      containers:
        - name: mlflow
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          ports:
            - containerPort: {{ .Values.mlflow.port }}
              name: http
          env:
            # Quoted so credentials containing YAML-special characters
            # (e.g. "#", ": ", quotes) cannot corrupt the rendered manifest.
            # NOTE(review): the password is rendered in plain text into the pod
            # spec; consider sourcing it from a Secret.
            - name: BACKEND_STORE_URI
              value: {{ printf "postgresql+psycopg2://%v:%v@%s-postgres:5432/%v" .Values.postgres.username .Values.postgres.password (include "mlflow-tracking.fullname" .) .Values.postgres.database | quote }}
            - name: ARTIFACT_ROOT
              value: {{ .Values.mlflow.defaultArtifactRoot | quote }}
          command: ["mlflow", "server"]
          # $(VAR) references are expanded by Kubernetes from the env entries above.
          args:
            - "--host"
            - "{{ .Values.mlflow.host }}"
            - "--port"
            - "{{ .Values.mlflow.port }}"
            - "--backend-store-uri"
            - "$(BACKEND_STORE_URI)"
            - "--default-artifact-root"
            - "$(ARTIFACT_ROOT)"
          volumeMounts:
            # The artifact root doubles as the mount point of the artifacts PVC.
            - name: artifacts
              mountPath: {{ .Values.mlflow.defaultArtifactRoot }}
          resources: {{- toYaml .Values.resources | nindent 12 }}
      volumes:
        - name: artifacts
          persistentVolumeClaim:
            claimName: {{ include "mlflow-tracking.fullname" . }}-artifacts
64+
---
65+
# Claim backing the "artifacts" volume of the MLflow Deployment.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "mlflow-tracking.fullname" . }}-artifacts
spec:
  accessModes:
    - "ReadWriteOnce"
  # storageClassName is emitted only when artifacts.storageClass is non-empty.
  {{- with .Values.artifacts.storageClass }}
  storageClassName: {{ . | quote }}
  {{- end }}
  resources:
    requests:
      storage: {{ .Values.artifacts.size }}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Headless Service (clusterIP: None) for the Postgres StatefulSet; its name is
# the host used in the MLflow backend store URI.
apiVersion: v1
kind: Service
metadata:
  name: {{ include "mlflow-tracking.fullname" . }}-postgres
spec:
  clusterIP: None
  selector:
    app.kubernetes.io/component: postgres
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
  ports:
    - name: postgres
      port: 5432
14+
---
15+
# Single-replica Postgres StatefulSet used as the MLflow backend store; data is
# kept on a per-pod "pgdata" volume claim.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ include "mlflow-tracking.fullname" . }}-postgres
spec:
  serviceName: {{ include "mlflow-tracking.fullname" . }}-postgres
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
      app.kubernetes.io/instance: {{ .Release.Name }}
      app.kubernetes.io/component: postgres
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ include "mlflow-tracking.name" . }}
        app.kubernetes.io/instance: {{ .Release.Name }}
        app.kubernetes.io/component: postgres
    spec:
      containers:
        - name: postgres
          image: {{ .Values.postgres.image }}
          ports:
            - containerPort: 5432
          env:
            - name: POSTGRES_USER
              value: {{ .Values.postgres.username | quote }}
            - name: POSTGRES_PASSWORD
              value: {{ .Values.postgres.password | quote }}
            - name: POSTGRES_DB
              value: {{ .Values.postgres.database | quote }}
          volumeMounts:
            - name: pgdata
              mountPath: /var/lib/postgresql/data
  volumeClaimTemplates:
    - metadata:
        name: pgdata
      spec:
        accessModes: ["ReadWriteOnce"]
        # Prefer a Postgres-specific storage class (postgres.storage.storageClass)
        # so the database volume can be configured next to its size; fall back to
        # artifacts.storageClass, which was the only source before.
        {{- with (default .Values.artifacts.storageClass .Values.postgres.storage.storageClass) }}
        storageClassName: {{ . | quote }}
        {{- end }}
        resources:
          requests:
            storage: {{ .Values.postgres.storage.size }}

0 commit comments

Comments
 (0)