This repository was archived by the owner on Feb 3, 2025. It is now read-only.

Commit 05425d2

LostnEkko authored and DEKHTIARJonathan committed
[Benchmarking-Py] Adding HF T5
1 parent 002f035 commit 05425d2

9 files changed: +555 -1 lines changed
Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
#!/bin/bash

nvidia-smi

set -x

BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# Runtime Parameters
MODEL_NAME=""
DATASET_NAME="realnewslike"

# Default Argument Values
BATCH_SIZE=32
SEQ_LEN=128

NUM_ITERATIONS=1000
OUTPUT_TENSOR_NAMES="encoder_last_hidden_state,logits,past_key_values"

BYPASS_ARGUMENTS=""

# Loop through arguments and process them
for arg in "$@"
do
    case $arg in
        --model_name=*)
        MODEL_NAME="${arg#*=}"
        shift # Remove --model_name= from processing
        ;;
        --dataset_name=*)
        DATASET_NAME="${arg#*=}"
        shift # Remove --dataset_name= from processing
        ;;
        --batch_size=*)
        BATCH_SIZE="${arg#*=}"
        shift # Remove --batch_size= from processing
        ;;
        --sequence_length=*)
        SEQ_LEN="${arg#*=}"
        shift # Remove --sequence_length= from processing
        ;;
        --num_iterations=*)
        NUM_ITERATIONS="${arg#*=}"
        shift # Remove --num_iterations= from processing
        ;;
        --output_tensors_name=*)
        OUTPUT_TENSOR_NAMES="${arg#*=}"
        shift # Remove --output_tensors_name= from processing
        ;;
        ######### IGNORE ARGUMENTS BELOW
        --data_dir=*)
        shift # Remove --data_dir= from processing
        ;;
        --input_saved_model_dir=*)
        shift # Remove --input_saved_model_dir= from processing
        ;;
        --tokenizer_model_dir=*)
        shift # Remove --tokenizer_model_dir= from processing
        ;;
        --total_max_samples=*)
        shift # Remove --total_max_samples= from processing
        ;;
        *)
        BYPASS_ARGUMENTS=" ${BYPASS_ARGUMENTS} ${arg}"
        ;;
    esac
done

# Directories derived from the parsed arguments (defined before the echo block
# below so the printed values are not empty).
DATA_DIR="/data/c4/${DATASET_NAME}"
MODEL_DIR="/models/huggingface/t5/${MODEL_NAME}/saved_models/model"
TOKENIZER_DIR="/models/huggingface/t5/${MODEL_NAME}/saved_models/tokenizer"

echo -e "\n********************************************************************"
echo "[*] MODEL_NAME: ${MODEL_NAME}"
echo "[*] DATASET_NAME: ${DATASET_NAME}"
echo ""
echo "[*] DATA_DIR: ${DATA_DIR}"
echo "[*] BATCH_SIZE: ${BATCH_SIZE}"
echo ""
# Custom T5 Task Flags
echo "[*] SEQ_LEN: ${SEQ_LEN}"
echo "[*] OUTPUT_TENSOR_NAMES: ${OUTPUT_TENSOR_NAMES}"
echo ""
echo "[*] BYPASS_ARGUMENTS: $(echo \"${BYPASS_ARGUMENTS}\" | tr -s ' ')"

echo -e "********************************************************************\n"

if [[ ! -d ${DATA_DIR} ]]; then
    echo "ERROR: \`--data_dir=/path/to/directory\` does not exist. [Received: \`${DATA_DIR}\`]"
    exit 1
fi

if [[ ! -d ${MODEL_DIR} ]]; then
    echo "ERROR: \`--input_saved_model_dir=/path/to/directory\` does not exist. [Received: \`${MODEL_DIR}\`]"
    exit 1
fi

if [[ ! -d ${TOKENIZER_DIR} ]]; then
    echo "ERROR: \`--tokenizer_model_dir=/path/to/directory\` does not exist. [Received: \`${TOKENIZER_DIR}\`]"
    exit 1
fi

# Install Dependencies
pip install --upgrade \
    prefetch_generator \
    orjson \
    t5==0.4.0

# Run the benchmark
python ${BASE_DIR}/infer.py \
    --data_dir=${DATA_DIR} \
    --calib_data_dir=${DATA_DIR} \
    --input_saved_model_dir=${MODEL_DIR} \
    --tokenizer_model_dir=${TOKENIZER_DIR} \
    --vocab_model_dir=${TOKENIZER_DIR} \
    --output_tensors_name=${OUTPUT_TENSOR_NAMES} \
    `# The following is set because we will be running synthetic benchmarks` \
    --total_max_samples=1 \
    --use_synthetic_data \
    --num_iterations=${NUM_ITERATIONS} \
    ${@}
Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
############## Requires T5 version 0.4.0 #############

import os
import glob

try:
    from prefetch_generator import background
except ModuleNotFoundError:
    print("[ERROR] Please install: `pip install --upgrade prefetch_generator`")
    raise

try:
    import orjson as json
except ModuleNotFoundError:
    print(
        "[WARNING] To process json data faster, please execute: "
        "`pip install --upgrade orjson`"
    )
    import json

import numpy as np
import tensorflow as tf

import t5.data.preprocessors as prep
from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
from transformers import T5Tokenizer


def get_dataset_c4(
    data_dir,
    vocab_model_dir=None,
    tokenizer_dir=None,
    sequence_length=128,
    batch_size=32,
    vocab_size=512,
    noise_density=0.15
):
    json_files = sorted(
        glob.glob(os.path.join(data_dir, "c4-validation.*.json"))
    )

    if tokenizer_dir is None:
        tokenizer = T5Tokenizer.from_pretrained("t5-small")
    else:
        tokenizer = T5Tokenizer.from_pretrained(tokenizer_dir)

    @background(max_prefetch=1)
    def jsonfile_parser(filename):

        with open(filename) as f:
            for line in f:
                data = json.loads(line)

                yield {
                    "targets":
                        np.squeeze(
                            tokenizer(
                                data["text"],
                                return_tensors="tf",
                                max_length=sequence_length,
                                truncation=True,
                                padding="max_length",
                            ).input_ids
                        )
                }

    def _get_ds_generator(_filename):
        return tf.data.Dataset.from_generator(
            lambda: jsonfile_parser(_filename),
            output_signature={
                "targets":
                    tf.TensorSpec(
                        shape=(sequence_length,), dtype=tf.int32, name=None
                    )
            },
        ).prefetch(buffer_size=tf.data.AUTOTUNE)

    dataset = tf.data.Dataset.sample_from_datasets(
        datasets=[_get_ds_generator(_f) for _f in json_files],
        seed=666,
        stop_on_empty_dataset=False
    )

    vocabulary = SentencePieceVocabulary(
        sentencepiece_model_file=os.path.join(vocab_model_dir, "spiece.model"),
        extra_ids=0
    )
    dataset = prep.denoise(
        dataset,
        vocabulary,
        noise_density=noise_density,
        noise_mask_fn=prep.random_spans_noise_mask,
        inputs_fn=prep.noise_token_to_sentinel,
        targets_fn=None
    )

    def transform_fn(features):
        pad_token_id = tokenizer.pad_token_id

        # Decoder start token is the pad token by default for T5.
        decoder_start_token_id = pad_token_id

        # Shift labels right by one position to create the decoder inputs.
        decoder_input_ids = tf.concat(
            [[decoder_start_token_id], features["targets"][:-1]],
            axis=0
        )

        # Replace any -100 label with the pad token so such positions are not
        # fed to the decoder as ignored-label markers.
        decoder_input_ids = tf.where(
            tf.equal(decoder_input_ids, -100),
            tf.fill(decoder_input_ids.shape.as_list(), pad_token_id),
            decoder_input_ids
        )

        # Set all attention masks to 1 since no padding is applied to the inputs here.
        return {
            "attention_mask": tf.ones_like(features["inputs"]),
            "decoder_attention_mask": tf.ones_like(decoder_input_ids),
            "decoder_input_ids": decoder_input_ids,
            "input_ids": features["inputs"],
            "targets": features["targets"]
        }

    dataset = dataset.map(transform_fn, num_parallel_calls=tf.data.AUTOTUNE)

    # Prefetch an entire batch of data before batching
    dataset = dataset.prefetch(buffer_size=batch_size)

    # Then batch
    dataset = dataset.batch(batch_size, drop_remainder=False)
    return dataset
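
For reference, here is a minimal sketch (not part of the commit) of how `get_dataset_c4` above could be driven on its own. The module name in the import and the directory paths are assumptions that mirror the launcher script's defaults; in the commit itself the wiring is done by `infer.py`, which is not shown here.

# Hypothetical standalone usage of get_dataset_c4. The module name and the
# paths below are assumptions mirroring the launcher script, not part of the commit.
import tensorflow as tf

from dataloading import get_dataset_c4  # module name is an assumption

DATA_DIR = "/data/c4/realnewslike"
TOKENIZER_DIR = "/models/huggingface/t5/t5-small/saved_models/tokenizer"

ds = get_dataset_c4(
    data_dir=DATA_DIR,
    vocab_model_dir=TOKENIZER_DIR,  # directory containing spiece.model
    tokenizer_dir=TOKENIZER_DIR,
    sequence_length=128,
    batch_size=32,
)

# Inspect one batch: each tensor should come out as (batch_size, sequence_length).
for batch in ds.take(1):
    for name, tensor in batch.items():
        print(f"{name:>24s}: shape={tensor.shape}, dtype={tensor.dtype.name}")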

tftrt/benchmarking-python/huggingface/t5/download_c4.py

Lines changed: 0 additions & 1 deletion
@@ -29,7 +29,6 @@
     },
 }

-# _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/{name}/c4-{split}.{index:05d}-of-{n_shards:05d}.json.gz"
 _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/607bd4c8450a42878aa9ddc051a65a055450ef87/{name}/c4-{split}.{index:05d}-of-{n_shards:05d}.json.gz"