AI-Hypercomputer
diff --git a/‎recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh‎
Lines changed: 40 additions & 0 deletions b/‎recml/inference/benchmarks/DLRM_DCNv2/ckpt_load_and_eval.sh‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh‎
Lines changed: 38 additions & 0 deletions b/‎recml/inference/benchmarks/DLRM_DCNv2/train_and_checkpoint.sh‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎recml/inference/benchmarks/README.md‎
Lines changed: 63 additions & 0 deletions b/‎recml/inference/benchmarks/README.md‎
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+
+export LIBTPU_INIT_ARGS=
+export XLA_FLAGS=
+
+export TPU_NAME=<TPU_NAME>
+export LEARNING_RATE=0.0034
+export BATCH_SIZE=135168
+export EMBEDDING_SIZE=128
+export MODEL_DIR=/tmp/
+export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-*
+export NUM_STEPS=28000
+export CHECKPOINT_INTERVAL=1500
+export EVAL_INTERVAL=1500
+export EVAL_FILE_PATTER=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
+export EVAL_STEPS=660
+export MODE=eval
+export EMBEDDING_THRESHOLD=21000
+export LOGGING_INTERVAL=1500
+export RESTORE_CHECKPOINT=true
+
+
+
+python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
+
+--learning_rate=${LEARNING_RATE} \
+--batch_size=${BATCH_SIZE} \
+--embedding_size=${EMBEDDING_SIZE} \
+--embedding_threshold=${EMBEDDING_THRESHOLD} \
+--model_dir=${MODEL_DIR} \
+--file_pattern=${FILE_PATTERN} \
+--num_steps=${NUM_STEPS} \
+--save_checkpoint_interval=${CHECKPOINT_INTERVAL} \
+--restore_checkpoint=${RESTORE_CHECKPOINT} \
+--eval_interval=${EVAL_INTERVAL} \
+--eval_file_pattern=${EVAL_FILE_PATTERN} \
+--eval_steps=${EVAL_STEPS}  \
+--mode=${MODE} \
+--logging_interval=${LOGGING_INTERVAL}
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+
+export LIBTPU_INIT_ARGS=
+export XLA_FLAGS=
+
+export TPU_NAME=<TPU_NAME>
+export LEARNING_RATE=0.0034
+export BATCH_SIZE=135168
+export EMBEDDING_SIZE=128
+export MODEL_DIR=/tmp/
+export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-*
+export NUM_STEPS=28000
+export CHECKPOINT_INTERVAL=1500
+export EVAL_INTERVAL=1500
+export EVAL_FILE_PATTER=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
+export EVAL_STEPS=660
+export MODE=eval
+export EMBEDDING_THRESHOLD=21000
+export LOGGING_INTERVAL=1500
+export RESTORE_CHECKPOINT=true
+
+python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
+
+--learning_rate=${LEARNING_RATE} \
+--batch_size=${BATCH_SIZE} \
+--embedding_size=${EMBEDDING_SIZE} \
+--embedding_threshold=${EMBEDDING_THRESHOLD} \
+--model_dir=${MODEL_DIR} \
+--file_pattern=${FILE_PATTERN} \
+--num_steps=${NUM_STEPS} \
+--save_checkpoint_interval=${CHECKPOINT_INTERVAL} \
+--restore_checkpoint=${RESTORE_CHECKPOINT} \
+--eval_interval=${EVAL_INTERVAL} \
+--eval_file_pattern=${EVAL_FILE_PATTERN} \
+--eval_steps=${EVAL_STEPS}  \
+--mode=${MODE} \
+--logging_interval=${LOGGING_INTERVAL}
@@ -0,0 +1,63 @@
+
+
+# Running RecML Inference benchmarks
+
+## Setup environment
+
+### Export environment variables
+
+```
+export TPU_NAME=
+export QR_NODE_NAME=
+export PROJECT=
+export ZONE=
+export ACCELERATOR_TYPE=
+export RUNTIME_VERSION=
+```
+
+### Launch a TPU VM
+
+```
+gcloud alpha compute tpus queued-resources create ${TPU_NAME} --node-id ${QR_NODE_NAME} --project ${PROJECT} --zone ${ZONE} --accelerator-type ${ACCELERATOR_TYPE} --runtime-version ${RUNTIME_VERSION}
+```
+
+### Install dependencies
+
+
+#### Clone the RecML repository
+
+```
+gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="git clone https://github.com/AI-Hypercomputer/RecML.git"
+```
+
+#### Install requirements
+
+```
+gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="cd RecML && pip install -r requirements.txt" 
+```
+
+#### Install jax and jaxlib nightly
+
+```
+gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="pip install -U --pre jax jaxlib libtpu requests -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/libtpu_releases.html --force"
+```
+
+#### Install JAX Sparsecore  (jax-tpu-embedding)
+
+```
+gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="pip install -U https://storage.googleapis.com/jax-tpu-embedding-whls/20250604/jax_tpu_embedding-0.1.0.dev20250604-cp310-cp310-manylinux_2_35_x86_64.whl --force"
+```
+
+#### Install other dependencies
+
+```
+gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="pip install -U tensorflow  dm-tree flax google-metrax"
+```
+
+#### Run workload
+
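+Note: Before running the command below, replace `<MODEL_NAME>` with the benchmark directory (e.g. `DLRM_DCNv2`) and `<TASK_NAME>` with the script to run (e.g. `ckpt_load_and_eval.sh`).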
+
+```
+gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all  --command="cd RecML && TPU_NAME=${TPU_NAME} ./recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
+```