
Commit 26e47a1

Author: Swetha Mandava
Message: update triton for amp
Parent: 1182059

9 files changed: 19 additions, 58 deletions

TensorFlow/LanguageModeling/BERT/Dockerfile

Lines changed: 3 additions & 3 deletions

@@ -2,7 +2,7 @@ ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
 
 FROM ${FROM_IMAGE_NAME}
 
-RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl4 curl
+RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl4 curl libb64-dev
 RUN pip install --upgrade pip
 RUN pip install toposort networkx pytest nltk tqdm html2text progressbar
 RUN pip --no-cache-dir --no-cache install git+https://github.com/NVIDIA/dllogger
@@ -17,12 +17,12 @@ RUN git clone https://github.com/titipata/pubmed_parser
 RUN pip3 install /workspace/pubmed_parser
 
 #Copy the perf_client over
-ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v1.12.0/v1.12.0_ubuntu1804.clients.tar.gz
+ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v1.14.0/v1.14.0_ubuntu1804.clients.tar.gz
 RUN mkdir -p /workspace/install \
     && curl -L ${TRTIS_CLIENTS_URL} | tar xvz -C /workspace/install
 
 #Install the python wheel with pip
-RUN pip install /workspace/install/python/tensorrtserver-1.12.0-py3-none-linux_x86_64.whl
+RUN pip install /workspace/install/python/tensorrtserver-1.14.0-py3-none-linux_x86_64.whl
 
 WORKDIR /workspace/bert
 COPY . .
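
The client tarball and the Python wheel track the Triton server release, so both references move from v1.12.0 to v1.14.0 together; libb64-dev is presumably a new dependency of the v1.14.0 client binaries. As a quick smoke test of the upgraded wheel, a health probe against a running server could look like the sketch below. This is a minimal sketch only: it assumes the v1 tensorrtserver Python API shipped in the wheel and a server reachable on the default HTTP port 8000.

# Hedged sketch: verify the tensorrtserver-1.14.0 wheel can reach a server.
# ProtocolType and ServerHealthContext are the v1 client API; the address
# localhost:8000 is an assumption, not taken from this commit.
from tensorrtserver.api import ProtocolType, ServerHealthContext

ctx = ServerHealthContext("localhost:8000", ProtocolType.HTTP, verbose=False)
print("live: ", ctx.is_live())   # True once the server process is up
print("ready:", ctx.is_ready())  # True once all models are loaded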

TensorFlow/LanguageModeling/BERT/run_classifier.py

Lines changed: 2 additions & 1 deletion

@@ -379,8 +379,9 @@ def metric_fn(per_example_loss, label_ids, logits):
       dummy_op = tf.no_op()
       # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
       if FLAGS.amp:
+        loss_scaler = tf.train.experimental.FixedLossScale(1)
         dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-            optimization.LAMBOptimizer(learning_rate=0.0))
+            optimization.LAMBOptimizer(learning_rate=0.0), loss_scaler)
       eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
       output_spec = tf.estimator.EstimatorSpec(
           mode=mode,
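
The same three-line pattern lands in run_ner.py, run_re.py, and run_squad.py below. In eval mode the rewrite is invoked with a throwaway optimizer purely to turn on the automatic fp16 graph rewrite, and passing FixedLossScale(1) pins the loss scale instead of the default dynamic schedule, presumably so the rewrite does not add loss-scale variables and update ops that have no use when no gradients are computed. A minimal standalone sketch of the pattern (TF 1.x API as in the 20.06-tf1 container; GradientDescentOptimizer stands in for the repo's optimization.LAMBOptimizer):

import tensorflow as tf

# Throwaway optimizer: nothing is minimized at eval time; the call exists
# only to enable the automatic mixed precision graph rewrite.
opt = tf.train.GradientDescentOptimizer(learning_rate=0.0)

# FixedLossScale(1) replaces the default dynamic loss scale, so the rewrite
# casts ops to fp16 without adding loss-scale variables or update ops.
loss_scaler = tf.train.experimental.FixedLossScale(1)
opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt, loss_scaler)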

TensorFlow/LanguageModeling/BERT/run_ner.py

Lines changed: 2 additions & 1 deletion

@@ -545,8 +545,9 @@ def model_fn(features, labels, mode, params):
       dummy_op = tf.no_op()
       # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
       if amp:
+        loss_scaler = tf.train.experimental.FixedLossScale(1)
         dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-            optimization.LAMBOptimizer(learning_rate=0.0))
+            optimization.LAMBOptimizer(learning_rate=0.0), loss_scaler)
 
       def metric_fn(per_example_loss, label_ids, logits):
       # def metric_fn(label_ids, logits):

TensorFlow/LanguageModeling/BERT/run_re.py

Lines changed: 2 additions & 1 deletion

@@ -626,8 +626,9 @@ def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
       dummy_op = tf.no_op()
       # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
       if amp:
+        loss_scaler = tf.train.experimental.FixedLossScale(1)
         dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-            optimization.LAMBOptimizer(learning_rate=0.0))
+            optimization.LAMBOptimizer(learning_rate=0.0), loss_scaler)
 
       def metric_fn(per_example_loss, label_ids, logits, is_real_example):
         predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)

TensorFlow/LanguageModeling/BERT/run_squad.py

Lines changed: 4 additions & 5 deletions

@@ -325,9 +325,9 @@ def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
(whitespace-only hunk: the two -/+ pairs below strip trailing spaces from blank lines)
     if init_checkpoint and (hvd is None or hvd.rank() == 0):
       (assignment_map, initialized_variable_names
       ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
-
+
       tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-
+
     if FLAGS.verbose_logging:
       tf.compat.v1.logging.info("**** Trainable Variables ****")
       for var in tvars:
@@ -370,8 +370,9 @@ def compute_loss(logits, positions):
       dummy_op = tf.no_op()
       # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite
       if amp:
+        loss_scaler = tf.train.experimental.FixedLossScale(1)
         dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite(
-            optimization.LAMBOptimizer(learning_rate=0.0))
+            optimization.LAMBOptimizer(learning_rate=0.0), loss_scaler)
 
       predictions = {
           "unique_ids": unique_ids,
@@ -807,8 +808,6 @@ def validate_flags_or_throw(bert_config):
 
 def export_model(estimator, export_dir, init_checkpoint):
   """Exports a checkpoint in SavedModel format in a directory structure compatible with Triton."""
-
-
   def serving_input_fn():
     label_ids = tf.placeholder(tf.int32, [None,], name='unique_ids')
     input_ids = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length], name='input_ids')
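
The last hunk also shows the start of export_model's serving_input_fn, which declares the placeholders Triton will feed. For orientation, here is a hedged sketch of how such a function is typically completed for an Estimator export; only unique_ids and input_ids appear in the visible context, so the input_mask and segment_ids placeholders are assumptions based on standard BERT inputs.

import tensorflow as tf

max_seq_length = 384  # stands in for FLAGS.max_seq_length

def serving_input_fn():
    # Placeholder names become the input tensor names in the SavedModel,
    # and hence the input names in the Triton model config.
    label_ids = tf.placeholder(tf.int32, [None], name='unique_ids')
    input_ids = tf.placeholder(tf.int32, [None, max_seq_length], name='input_ids')
    input_mask = tf.placeholder(tf.int32, [None, max_seq_length], name='input_mask')  # assumed
    segment_ids = tf.placeholder(tf.int32, [None, max_seq_length], name='segment_ids')  # assumed
    return tf.estimator.export.build_raw_serving_input_receiver_fn({
        'unique_ids': label_ids,
        'input_ids': input_ids,
        'input_mask': input_mask,
        'segment_ids': segment_ids,
    })()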

TensorFlow/LanguageModeling/BERT/triton/scripts/generate_figures.sh

Lines changed: 2 additions & 33 deletions

@@ -37,15 +37,6 @@ use_xla=true
 EXPORT_MODEL_ARGS="${precision} ${use_xla} ${seq_length} ${doc_stride} ${BERT_DIR} 1 ${MODEL_NAME}"
 PERF_CLIENT_ARGS="1000 10 20 localhost"
 
-# Start Server
-bash triton/scripts/launch_server.sh $precision
-
-# Restart Server
-restart_server() {
-    docker kill triton_server_cont
-    bash triton/scripts/launch_server.sh $precision
-}
-
 ############## Dynamic Batching Comparison ##############
 SERVER_BATCH_SIZE=8
 CLIENT_BATCH_SIZE=1
@@ -54,30 +45,22 @@ TRITON_ENGINE_COUNT=1
 # Dynamic batching 10 ms
 TRITON_DYN_BATCHING_DELAY=10
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
 
 # Dynamic batching 5 ms
 TRITON_DYN_BATCHING_DELAY=5
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
 
 # Dynamic batching 2 ms
 TRITON_DYN_BATCHING_DELAY=2
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
 
 
 # Static Batching (i.e. Dynamic batching 0 ms)
 TRITON_DYN_BATCHING_DELAY=0
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
 
 
@@ -89,58 +72,44 @@ TRITON_DYN_BATCHING_DELAY=0
 # Engine Count = 4
 TRITON_ENGINE_COUNT=4
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
 
 # Engine Count = 2
 TRITON_ENGINE_COUNT=2
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
 
 # Engine Count = 1
 TRITON_ENGINE_COUNT=1
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS}
 
 
 ############## Batch Size Comparison ##############
 # BATCH=1 Generate model and perf
 SERVER_BATCH_SIZE=1
 CLIENT_BATCH_SIZE=1
-TRITON_ENGINE_COUNT=1
-TRITON_DYN_BATCHING_DELAY=0
+TRITON_ENGINE_COUNT=1
+TRITON_DYN_BATCHING_DELAY=0
(whitespace-only: trailing spaces removed from the two lines above)
 
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 64 localhost
 
 # BATCH=2 Generate model and perf
 SERVER_BATCH_SIZE=2
 CLIENT_BATCH_SIZE=2
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 32 localhost
 
 # BATCH=4 Generate model and perf
 SERVER_BATCH_SIZE=4
 CLIENT_BATCH_SIZE=4
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 16 localhost
 
 # BATCH=8 Generate model and perf
 SERVER_BATCH_SIZE=8
 CLIENT_BATCH_SIZE=8
 bash triton/scripts/export_model.sh ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
-restart_server
-sleep 15
 bash triton/scripts/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 8 localhost
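
With the per-configuration server restarts gone, each export_model.sh call regenerates the model repository entry and the running server presumably picks the change up from the model store. The three knobs swept above are the server-side max batch size, the dynamic batching queue delay, and the engine (instance) count. Below is a hedged sketch of how these map onto Triton model-config fields; the field names follow Triton's model-configuration schema, but the exact config.pbtxt that export_model.sh writes may differ.

def triton_config(max_batch_size, delay_ms, engine_count):
    """Render the config.pbtxt fields the three script knobs control (sketch)."""
    lines = [
        'platform: "tensorflow_savedmodel"',
        f'max_batch_size: {max_batch_size}',
        f'instance_group {{ count: {engine_count} kind: KIND_GPU }}',
    ]
    if delay_ms > 0:
        # The Triton schema expresses the queue delay in microseconds.
        lines.append('dynamic_batching { '
                     f'preferred_batch_size: [ {max_batch_size} ] '
                     f'max_queue_delay_microseconds: {delay_ms * 1000} }}')
    return '\n'.join(lines)

print(triton_config(8, 10, 1))  # SERVER_BATCH_SIZE=8, 10 ms delay, 1 engine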

TensorFlow/LanguageModeling/BERT/triton/scripts/launch_server.sh

Lines changed: 2 additions & 12 deletions

@@ -1,16 +1,7 @@
-precision=${1:-"fp16"}
 NV_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-"all"}
 
-if [ "$precision" = "fp16" ] ; then
-    echo "fp16 activated!"
-    export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
-else
-    echo "fp32 activated!"
-    export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=0
-fi
-
 # Start TRITON server in detached state
-docker run --gpus all -d --rm \
+docker run --gpus $NV_VISIBLE_DEVICES --rm -d \
   --shm-size=1g \
   --ulimit memlock=-1 \
   --ulimit stack=67108864 \
@@ -19,6 +10,5 @@ docker run --gpus all -d --rm \
   -p8002:8002 \
   --name triton_server_cont \
   -e NVIDIA_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES \
-  -e TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE \
   -v $PWD/results/triton_models:/models \
-  nvcr.io/nvidia/tritonserver:20.03-py3 trtserver --model-store=/models --strict-model-config=false
+  nvcr.io/nvidia/tritonserver:20.06-v1-py3 tritonserver --model-store=/models --strict-model-config=false

TensorFlow/LanguageModeling/BERT/triton/scripts/run_perf_client.sh

Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@
 if [ ! "$(docker inspect -f "{{.State.Running}}" triton_server_cont)" = "true" ] ; then
 
     echo "Launching TRITON server"
-    bash triton/scripts/launch_server.sh $precision
+    bash triton/scripts/launch_server.sh
     SERVER_LAUNCHED=true
 
     function cleanup_server {

TensorFlow/LanguageModeling/BERT/triton/scripts/run_triton.sh

Lines changed: 1 addition & 1 deletion

@@ -88,7 +88,7 @@ if [ "$triton_export_model" = "true" ] ; then
 fi
 
 # Start TRTIS server in detached state
-bash triton/scripts/launch_server.sh $precision
+bash triton/scripts/launch_server.sh
 
 # Wait until server is up. curl on the health of the server and sleep until its ready
 bash triton/scripts/wait_for_triton_server.sh localhost
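
wait_for_triton_server.sh polls the server's health endpoint and sleeps until it reports ready. An equivalent hedged sketch in Python, assuming the v1 REST route /api/health/ready on the default HTTP port 8000 (both assumptions; this commit does not show that script's contents):

import time
import urllib.request

def wait_for_triton(host='localhost', port=8000, timeout_s=300):
    # A v1 Triton server answers 200 on /api/health/ready once all
    # models in the store are loaded and servable.
    url = f'http://{host}:{port}/api/health/ready'
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass  # not accepting connections yet; keep polling
        time.sleep(2)
    return False

if not wait_for_triton():
    raise SystemExit('Triton server did not become ready in time')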
