|
| 1 | +## Running distributed MNIST training / inference |
| 2 | + |
| 3 | +### _using Dataset_ |
| 4 | +```bash |
| 5 | +# for CPU mode: |
| 6 | +# export QUEUE=default |
| 7 | +# remove references to $LIB_CUDA |
| 8 | + |
| 9 | +# hdfs dfs -rm -r mnist_model |
| 10 | +# hdfs dfs -rm -r predictions |
| 11 | + |
| 12 | +${SPARK_HOME}/bin/spark-submit \ |
| 13 | +--master yarn \ |
| 14 | +--deploy-mode cluster \ |
| 15 | +--queue ${QUEUE} \ |
| 16 | +--num-executors 4 \ |
| 17 | +--executor-memory 27G \ |
| 18 | +--py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist_dataset.py \ |
| 19 | +--conf spark.dynamicAllocation.enabled=false \ |
| 20 | +--conf spark.yarn.maxAppAttempts=1 \ |
| 21 | +--archives hdfs:///user/${USER}/Python.zip#Python \ |
| 22 | +--conf spark.executorEnv.LD_LIBRARY_PATH=$LIB_CUDA:$LIB_JVM:$LIB_HDFS \ |
| 23 | +--driver-library-path=$LIB_CUDA \ |
| 24 | +TensorFlowOnSpark/examples/mnist/tf/mnist_spark_dataset.py \ |
| 26 | +--images_labels mnist/csv2/train \ |
| 27 | +--format csv2 \ |
| 28 | +--mode train \ |
| 29 | +--model mnist_model |
| 30 | + |
| 31 | +# to use inference mode, change `--mode train` to `--mode inference` and add `--output predictions` |
| 32 | +# one item in csv2 format is `image | label`; to use input data in TFRecord format, change `--format csv2` to `--format tfr` |
| 33 | +# to use infiniband, add `--rdma` |
| 34 | +``` |
| 35 | + |
| 36 | +### _using QueueRunners_ |
| 37 | +```bash |
| 38 | +# for CPU mode: |
| 39 | +# export QUEUE=default |
| 40 | +# remove references to $LIB_CUDA |
| 41 | + |
| 42 | +# hdfs dfs -rm -r mnist_model |
| 43 | +# hdfs dfs -rm -r predictions |
| 44 | + |
| 45 | +${SPARK_HOME}/bin/spark-submit \ |
| 46 | +--master yarn \ |
| 47 | +--deploy-mode cluster \ |
| 48 | +--queue ${QUEUE} \ |
| 49 | +--num-executors 4 \ |
| 50 | +--executor-memory 27G \ |
| 51 | +--py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist.py \ |
| 52 | +--conf spark.dynamicAllocation.enabled=false \ |
| 53 | +--conf spark.yarn.maxAppAttempts=1 \ |
| 54 | +--archives hdfs:///user/${USER}/Python.zip#Python \ |
| 55 | +--conf spark.executorEnv.LD_LIBRARY_PATH=$LIB_CUDA:$LIB_JVM:$LIB_HDFS \ |
| 56 | +--driver-library-path=$LIB_CUDA \ |
| 57 | +TensorFlowOnSpark/examples/mnist/tf/mnist_spark.py \ |
| 58 | +--images mnist/csv/train/images \ |
| 59 | +--labels mnist/csv/train/labels \ |
| 60 | +--format csv \ |
| 61 | +--mode train \ |
| 62 | +--model mnist_model |
| 63 | + |
| 64 | +# to use inference mode, change `--mode train` to `--mode inference` and add `--output predictions` |
| 65 | +# to use input data in TFRecord format, change `--format csv` to `--format tfr` |
| 66 | +# to use infiniband, add `--rdma` |
| 67 | +``` |
| 68 | + |
| 69 | +### _using Spark ML Pipeline_ |
| 70 | +```bash |
| 71 | +# for CPU mode: |
| 72 | +# export QUEUE=default |
| 73 | +# remove references to $LIB_CUDA |
| 74 | + |
| 75 | +# hdfs dfs -rm -r mnist_model |
| 76 | +# hdfs dfs -rm -r mnist_export |
| 77 | +# hdfs dfs -rm -r tfrecords |
| 78 | +# hdfs dfs -rm -r predictions |
| 79 | + |
| 80 | +${SPARK_HOME}/bin/spark-submit \ |
| 81 | +--master yarn \ |
| 82 | +--deploy-mode cluster \ |
| 83 | +--queue ${QUEUE} \ |
| 84 | +--num-executors 4 \ |
| 85 | +--executor-memory 27G \ |
| 86 | +--jars hdfs:///user/${USER}/tensorflow-hadoop-1.0-SNAPSHOT.jar \ |
| 87 | +--py-files TensorFlowOnSpark/tfspark.zip,TensorFlowOnSpark/examples/mnist/tf/mnist_dist_pipeline.py \ |
| 88 | +--conf spark.dynamicAllocation.enabled=false \ |
| 89 | +--conf spark.yarn.maxAppAttempts=1 \ |
| 90 | +--archives hdfs:///user/${USER}/Python.zip#Python \ |
| 91 | +--conf spark.executorEnv.LD_LIBRARY_PATH=$LIB_CUDA:$LIB_JVM:$LIB_HDFS \ |
| 92 | +--driver-library-path=$LIB_CUDA \ |
| 93 | +TensorFlowOnSpark/examples/mnist/tf/mnist_spark_pipeline.py \ |
| 94 | +--images mnist/csv/train/images \ |
| 95 | +--labels mnist/csv/train/labels \ |
| 96 | +--tfrecord_dir tfrecords \ |
| 97 | +--format csv \ |
| 98 | +--model_dir mnist_model \ |
| 99 | +--export_dir mnist_export \ |
| 100 | +--train \ |
| 101 | +--inference_mode signature \ |
| 102 | +--inference_output predictions |
| 103 | + |
| 104 | +# to use input data in TFRecord format, change `--format csv` to `--format tfr` |
| 105 | +# tensorflow-hadoop-1.0-SNAPSHOT.jar is needed for transforming csv input to TFRecord |
| 106 | +# `--tfrecord_dir` is needed for temporarily saving dataframe to TFRecord on hdfs |
| 107 | +``` |
0 commit comments