Skip to content

Commit 077c29d

Browse files
author
mo
committed
Phase I: add runner image Dockerfile, standardize CLI (--input/--output, --mlflow_uri, --grad_accum, --print-sample), add labels; mitigate blinker uninstall; fix paths (/data, /app/src).
1 parent 0deb0d6 commit 077c29d

File tree

4 files changed: +61 −3 lines changed

Dockerfile.runner

Lines changed: 21 additions & 0 deletions
```diff
@@ -0,0 +1,21 @@
+FROM tensorflow/tensorflow:2.20.0
+
+WORKDIR /app
+
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    HF_HOME=/root/.cache/huggingface \
+    PYTHONUNBUFFERED=1
+
+# Copy only manifests and requirements first for better layer caching
+COPY requirements.txt cicd-requirements.txt /app/
+
+RUN python -m pip install --upgrade pip \
+    && PIP_BREAK_SYSTEM_PACKAGES=1 pip install --no-cache-dir -r cicd-requirements.txt -r requirements.txt \
+    && PIP_BREAK_SYSTEM_PACKAGES=1 pip install --no-cache-dir mlflow transformers \
+    && PIP_BREAK_SYSTEM_PACKAGES=1 pip install --no-cache-dir --ignore-installed blinker
+
+# Copy the rest of the repo
+COPY . /app
+
+ENTRYPOINT ["python", "tokenize_first_runner.py"]
```

k8s-prepare-job.yaml

Lines changed: 5 additions & 1 deletion
```diff
@@ -3,6 +3,9 @@ kind: Job
 metadata:
   name: cerebros-prepare-tokens
   namespace: cerebros
+  labels:
+    app: cerebros
+    phase: prepare
 spec:
   template:
     spec:
@@ -22,9 +25,10 @@ spec:
           python -m pip install --upgrade pip
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages -r requirements.txt
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages -r cicd-requirements.txt
+          PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages --ignore-installed blinker
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages mlflow transformers
           python tokenize_first_runner.py --mode prepare \
-            --out /data/train_tokens.npz --max_len 128 \
+            --input /data/train.jsonl --output /data/train_tokens.npz --max_len 128 \
             --tokenizer_checkpoint HuggingFaceTB/SmolLM3-3B
         env:
           - name: MLFLOW_TRACKING_URI
```
(YAML indentation reconstructed per standard Kubernetes Job layout; the extraction had stripped it.)

k8s-train-job.yaml

Lines changed: 5 additions & 1 deletion
```diff
@@ -3,6 +3,9 @@ kind: Job
 metadata:
   name: cerebros-train
   namespace: cerebros
+  labels:
+    app: cerebros
+    phase: train
 spec:
   template:
     spec:
@@ -22,9 +25,10 @@ spec:
           python -m pip install --upgrade pip
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages -r requirements.txt
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages -r cicd-requirements.txt
+          PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages --ignore-installed blinker
           PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --no-cache-dir --break-system-packages mlflow transformers
           python tokenize_first_runner.py --mode train \
-            --cache /data/train_tokens.npz --epochs 1 --batch 8 --print-score-only
+            --cache /data/train_tokens.npz --epochs 1 --batch 8 --grad_accum 5 --mlflow_uri http://mlflow-service:5000 --print-sample
         env:
           - name: MLFLOW_TRACKING_URI
             value: "http://mlflow-service:5000"
```
(YAML indentation reconstructed per standard Kubernetes Job layout; the extraction had stripped it.)

tokenize_first_runner.py

Lines changed: 30 additions & 1 deletion
```diff
@@ -176,12 +176,17 @@ def train_from_cache(args):
     print("🏗️ Model compiled")

     # Train
+    # Configure MLflow URI if provided
+    if args.mlflow_uri:
+        mlflow.set_tracking_uri(args.mlflow_uri)
+
     with mlflow.start_run():
         mlflow.log_params({
             "vocab_size": vocab_size,
             "max_len": max_len,
             "epochs": args.epochs,
-            "batch_size": args.batch_size
+            "batch_size": args.batch_size,
+            "grad_accum": args.grad_accum
         })

         history = model.fit(
@@ -211,6 +216,24 @@ def train_from_cache(args):
         model.save(model_path)
         mlflow.log_artifact(model_path)

+        # Optional: print a small decoded sample
+        if args.print_sample:
+            try:
+                # Decode first few tokens of the first test sample
+                first_ids = test_tokens[0][: min(64, max_len)]
+                # When we tokenized, we used only input_ids; decoding with a generic tokenizer requires a checkpoint
+                # We store tokenizer checkpoint in params; try to re-load quickly
+                tokenizer_ckpt = args.tokenizer_checkpoint or "HuggingFaceTB/SmolLM3-3B"
+                tok = AutoTokenizer.from_pretrained(tokenizer_ckpt)
+                text_preview = tok.decode(first_ids, skip_special_tokens=True)
+                sample_path = "sample_preview.txt"
+                with open(sample_path, "w", encoding="utf-8") as f:
+                    f.write(text_preview)
+                mlflow.log_artifact(sample_path)
+                print(f"📝 Sample preview: {text_preview[:200]}")
+            except Exception as e:
+                print(f"⚠️ Could not generate sample preview: {e}")
+
         # Print model size
         model_size_mb = os.path.getsize(model_path) / (1024 * 1024)
         print(f"💾 Model size: {model_size_mb:.2f} MB")
@@ -248,8 +271,14 @@ def main():
                         help="Training epochs")
     parser.add_argument("--batch", "--batch_size", type=int, default=8, dest="batch_size",
                         help="Batch size")
+    parser.add_argument("--grad_accum", type=int, default=1,
+                        help="Gradient accumulation steps (placeholder; model uses batch_size today)")
     parser.add_argument("--print-score-only", action="store_true",
                         help="Print only final scalar score")
+    parser.add_argument("--mlflow_uri", default=os.environ.get("MLFLOW_TRACKING_URI", ""),
+                        help="MLflow tracking URI")
+    parser.add_argument("--print-sample", action="store_true",
+                        help="Decode and log a small text preview from test tokens")

     args = parser.parse_args()

```
(Python indentation reconstructed conventionally; the extraction had stripped it. All added/removed/context lines preserved verbatim.)

0 commit comments

Comments
 (0)