Skip to content

Commit 077c29d

Browse files
author
mo
committed
Phase I: add runner image Dockerfile, standardize CLI (--input/--output, --mlflow_uri, --grad_accum, --print-sample), add labels; mitigate blinker uninstall; fix paths (/data, /app/src).
1 parent 0deb0d6 commit 077c29d

File tree

4 files changed: +61 −3 lines changed

Dockerfile.runner

Lines changed: 21 additions & 0 deletions
```diff
@@ -0,0 +1,21 @@
+FROM tensorflow/tensorflow:2.20.0
+
+WORKDIR /app
+
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    HF_HOME=/root/.cache/huggingface \
+    PYTHONUNBUFFERED=1
+
+# Copy only manifests and requirements first for better layer caching
+COPY requirements.txt cicd-requirements.txt /app/
+
+RUN python -m pip install --upgrade pip \
+    && PIP_BREAK_SYSTEM_PACKAGES=1 pip install --no-cache-dir -r cicd-requirements.txt -r requirements.txt \
+    && PIP_BREAK_SYSTEM_PACKAGES=1 pip install --no-cache-dir mlflow transformers \
+    && PIP_BREAK_SYSTEM_PACKAGES=1 pip install --no-cache-dir --ignore-installed blinker
+
+# Copy the rest of the repo
+COPY . /app
+
+ENTRYPOINT ["python", "tokenize_first_runner.py"]
```

k8s-prepare-job.yaml

Lines changed: 5 additions & 1 deletion
```diff
@@ -3,6 +3,9 @@ kind: Job
 metadata:
   name: cerebros-prepare-tokens
   namespace: cerebros
+  labels:
+    app: cerebros
+    phase: prepare
 spec:
   template:
     spec:
@@ -22,9 +25,10 @@ spec:
           python -m pip install --upgrade pip
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages -r requirements.txt
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages -r cicd-requirements.txt
+          PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages --ignore-installed blinker
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages mlflow transformers
           python tokenize_first_runner.py --mode prepare \
-            --out /data/train_tokens.npz --max_len 128 \
+            --input /data/train.jsonl --output /data/train_tokens.npz --max_len 128 \
             --tokenizer_checkpoint HuggingFaceTB/SmolLM3-3B
         env:
           - name: MLFLOW_TRACKING_URI
```
(YAML indentation reconstructed per standard Kubernetes Job layout; the extraction had stripped it.)

k8s-train-job.yaml

Lines changed: 5 additions & 1 deletion
```diff
@@ -3,6 +3,9 @@ kind: Job
 metadata:
   name: cerebros-train
   namespace: cerebros
+  labels:
+    app: cerebros
+    phase: train
 spec:
   template:
     spec:
@@ -22,9 +25,10 @@ spec:
           python -m pip install --upgrade pip
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages -r requirements.txt
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages -r cicd-requirements.txt
+          PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages --ignore-installed blinker
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages mlflow transformers
           python tokenize_first_runner.py --mode train \
-            --cache /data/train_tokens.npz --epochs 1 --batch 8 --print-score-only
+            --cache /data/train_tokens.npz --epochs 1 --batch 8 --grad_accum 5 --mlflow_uri http://mlflow-service:5000 --print-sample
         env:
           - name: MLFLOW_TRACKING_URI
             value: "http://mlflow-service:5000"
```
(YAML indentation reconstructed per standard Kubernetes Job layout; the extraction had stripped it.)

tokenize_first_runner.py

Lines changed: 30 additions & 1 deletion
```diff
@@ -176,12 +176,17 @@ def train_from_cache(args):
     print("🏗️ Model compiled")

     # Train
+    # Configure MLflow URI if provided
+    if args.mlflow_uri:
+        mlflow.set_tracking_uri(args.mlflow_uri)
+
     with mlflow.start_run():
         mlflow.log_params({
             "vocab_size": vocab_size,
             "max_len": max_len,
             "epochs": args.epochs,
-            "batch_size": args.batch_size
+            "batch_size": args.batch_size,
+            "grad_accum": args.grad_accum
         })

         history = model.fit(
@@ -211,6 +216,24 @@ def train_from_cache(args):
         model.save(model_path)
         mlflow.log_artifact(model_path)

+        # Optional: print a small decoded sample
+        if args.print_sample:
+            try:
+                # Decode first few tokens of the first test sample
+                first_ids = test_tokens[0][: min(64, max_len)]
+                # When we tokenized, we used only input_ids; decoding with a generic tokenizer requires a checkpoint
+                # We store tokenizer checkpoint in params; try to re-load quickly
+                tokenizer_ckpt = args.tokenizer_checkpoint or "HuggingFaceTB/SmolLM3-3B"
+                tok = AutoTokenizer.from_pretrained(tokenizer_ckpt)
+                text_preview = tok.decode(first_ids, skip_special_tokens=True)
+                sample_path = "sample_preview.txt"
+                with open(sample_path, "w", encoding="utf-8") as f:
+                    f.write(text_preview)
+                mlflow.log_artifact(sample_path)
+                print(f"📝 Sample preview: {text_preview[:200]}")
+            except Exception as e:
+                print(f"⚠️ Could not generate sample preview: {e}")
+
         # Print model size
         model_size_mb = os.path.getsize(model_path) / (1024 * 1024)
         print(f"💾 Model size: {model_size_mb:.2f} MB")
@@ -248,8 +271,14 @@ def main():
                         help="Training epochs")
     parser.add_argument("--batch", "--batch_size", type=int, default=8, dest="batch_size",
                         help="Batch size")
+    parser.add_argument("--grad_accum", type=int, default=1,
+                        help="Gradient accumulation steps (placeholder; model uses batch_size today)")
     parser.add_argument("--print-score-only", action="store_true",
                         help="Print only final scalar score")
+    parser.add_argument("--mlflow_uri", default=os.environ.get("MLFLOW_TRACKING_URI", ""),
+                        help="MLflow tracking URI")
+    parser.add_argument("--print-sample", action="store_true",
+                        help="Decode and log a small text preview from test tokens")

     args = parser.parse_args()

```
(Python indentation reconstructed conventionally; the extraction had stripped it. All added/removed/context lines preserved verbatim.)

0 commit comments

Comments
 (0)