This repository was archived by the owner on Feb 3, 2025. It is now read-only.

Commit 05425d2

LostnEkko authored and DEKHTIARJonathan committed
[Benchmarking-Py] Adding HF T5
1 parent 002f035 commit 05425d2

9 files changed: +555 -1 lines changed
Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
#!/bin/bash

nvidia-smi

set -x

BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# Runtime Parameters
MODEL_NAME=""
DATASET_NAME="realnewslike"

# Default Argument Values
BATCH_SIZE=32
SEQ_LEN=128

NUM_ITERATIONS=1000
OUTPUT_TENSOR_NAMES="encoder_last_hidden_state,logits,past_key_values"

BYPASS_ARGUMENTS=""

# Loop through arguments and process them
for arg in "$@"
do
    case $arg in
        --model_name=*)
        MODEL_NAME="${arg#*=}"
        shift # Remove --model_name= from processing
        ;;
        --dataset_name=*)
        DATASET_NAME="${arg#*=}"
        shift # Remove --dataset_name= from processing
        ;;
        --batch_size=*)
        BATCH_SIZE="${arg#*=}"
        shift # Remove --batch_size= from processing
        ;;
        --sequence_length=*)
        SEQ_LEN="${arg#*=}"
        shift # Remove --sequence_length= from processing
        ;;
        --num_iterations=*)
        NUM_ITERATIONS="${arg#*=}"
        shift # Remove --num_iterations= from processing
        ;;
        --output_tensors_name=*)
        OUTPUT_TENSOR_NAMES="${arg#*=}"
        shift # Remove --output_tensors_name= from processing
        ;;
        ######### IGNORE ARGUMENTS BELOW
        --data_dir=*)
        shift # Remove --data_dir= from processing
        ;;
        --input_saved_model_dir=*)
        shift # Remove --input_saved_model_dir= from processing
        ;;
        --tokenizer_model_dir=*)
        shift # Remove --tokenizer_model_dir= from processing
        ;;
        --total_max_samples=*)
        shift # Remove --total_max_samples= from processing
        ;;
        *)
        BYPASS_ARGUMENTS=" ${BYPASS_ARGUMENTS} ${arg}"
        ;;
    esac
done

# Directories derived from the parsed arguments (defined before the echo block
# below so the printed values are not empty).
DATA_DIR="/data/c4/${DATASET_NAME}"
MODEL_DIR="/models/huggingface/t5/${MODEL_NAME}/saved_models/model"
TOKENIZER_DIR="/models/huggingface/t5/${MODEL_NAME}/saved_models/tokenizer"

echo -e "\n********************************************************************"
echo "[*] MODEL_NAME: ${MODEL_NAME}"
echo "[*] DATASET_NAME: ${DATASET_NAME}"
echo ""
echo "[*] DATA_DIR: ${DATA_DIR}"
echo "[*] BATCH_SIZE: ${BATCH_SIZE}"
echo ""
# Custom T5 Task Flags
echo "[*] SEQ_LEN: ${SEQ_LEN}"
echo "[*] OUTPUT_TENSOR_NAMES: ${OUTPUT_TENSOR_NAMES}"
echo ""
echo "[*] BYPASS_ARGUMENTS: $(echo \"${BYPASS_ARGUMENTS}\" | tr -s ' ')"

echo -e "********************************************************************\n"

if [[ ! -d ${DATA_DIR} ]]; then
    echo "ERROR: \`--data_dir=/path/to/directory\` does not exist. [Received: \`${DATA_DIR}\`]"
    exit 1
fi

if [[ ! -d ${MODEL_DIR} ]]; then
    echo "ERROR: \`--input_saved_model_dir=/path/to/directory\` does not exist. [Received: \`${MODEL_DIR}\`]"
    exit 1
fi

if [[ ! -d ${TOKENIZER_DIR} ]]; then
    echo "ERROR: \`--tokenizer_model_dir=/path/to/directory\` does not exist. [Received: \`${TOKENIZER_DIR}\`]"
    exit 1
fi

# Install Dependencies
pip install --upgrade \
    prefetch_generator \
    orjson \
    t5==0.4.0

# Run the benchmark
python ${BASE_DIR}/infer.py \
    --data_dir=${DATA_DIR} \
    --calib_data_dir=${DATA_DIR} \
    --input_saved_model_dir=${MODEL_DIR} \
    --tokenizer_model_dir=${TOKENIZER_DIR} \
    --vocab_model_dir=${TOKENIZER_DIR} \
    --output_tensors_name=${OUTPUT_TENSOR_NAMES} \
    `# The following is set because we will be running synthetic benchmarks` \
    --total_max_samples=1 \
    --use_synthetic_data \
    --num_iterations=${NUM_ITERATIONS} \
    ${@}
Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
############## Requires T5 version 0.4.0 #############

import os
import glob

try:
    from prefetch_generator import background
except ModuleNotFoundError:
    print("[ERROR] Please install: `pip install --upgrade prefetch_generator`")
    raise

try:
    import orjson as json
except ModuleNotFoundError:
    print(
        "[WARNING] To process json data faster, please execute: "
        "`pip install --upgrade orjson`"
    )
    import json

import numpy as np
import tensorflow as tf

import t5.data.preprocessors as prep
from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
from transformers import T5Tokenizer


def get_dataset_c4(
    data_dir,
    vocab_model_dir=None,
    tokenizer_dir=None,
    sequence_length=128,
    batch_size=32,
    vocab_size=512,
    noise_density=0.15
):
    json_files = sorted(
        glob.glob(os.path.join(data_dir, "c4-validation.*.json"))
    )

    if tokenizer_dir is None:
        tokenizer = T5Tokenizer.from_pretrained("t5-small")
    else:
        tokenizer = T5Tokenizer.from_pretrained(tokenizer_dir)

    @background(max_prefetch=1)
    def jsonfile_parser(filename):

        with open(filename) as f:
            for line in f:
                data = json.loads(line)

                yield {
                    "targets":
                        np.squeeze(
                            tokenizer(
                                data["text"],
                                return_tensors="tf",
                                max_length=sequence_length,
                                truncation=True,
                                padding="max_length",
                            ).input_ids
                        )
                }

    def _get_ds_generator(_filename):
        return tf.data.Dataset.from_generator(
            lambda: jsonfile_parser(_filename),
            output_signature={
                "targets":
                    tf.TensorSpec(
                        shape=(sequence_length,), dtype=tf.int32, name=None
                    )
            },
        ).prefetch(buffer_size=tf.data.AUTOTUNE)

    dataset = tf.data.Dataset.sample_from_datasets(
        datasets=[_get_ds_generator(_f) for _f in json_files],
        seed=666,
        stop_on_empty_dataset=False
    )

    vocabulary = SentencePieceVocabulary(
        sentencepiece_model_file=os.path.join(vocab_model_dir, "spiece.model"),
        extra_ids=0
    )
    dataset = prep.denoise(
        dataset,
        vocabulary,
        noise_density=noise_density,
        noise_mask_fn=prep.random_spans_noise_mask,
        inputs_fn=prep.noise_token_to_sentinel,
        targets_fn=None
    )

    def transform_fn(features):
        pad_token_id = tokenizer.pad_token_id

        # Decoder start token is the pad token by default for T5.
        decoder_start_token_id = pad_token_id

        # Shift labels right by one position to create the decoder inputs.
        decoder_input_ids = tf.concat(
            [[decoder_start_token_id], features["targets"][:-1]],
            axis=0
        )

        # Replace any -100 label with the pad token so such positions are not
        # fed to the decoder as ignored-label markers.
        decoder_input_ids = tf.where(
            tf.equal(decoder_input_ids, -100),
            tf.fill(decoder_input_ids.shape.as_list(), pad_token_id),
            decoder_input_ids
        )

        # Set all attention masks to 1 since no padding is applied to the inputs here.
        return {
            "attention_mask": tf.ones_like(features["inputs"]),
            "decoder_attention_mask": tf.ones_like(decoder_input_ids),
            "decoder_input_ids": decoder_input_ids,
            "input_ids": features["inputs"],
            "targets": features["targets"]
        }

    dataset = dataset.map(transform_fn, num_parallel_calls=tf.data.AUTOTUNE)

    # Prefetch an entire batch of data before batching
    dataset = dataset.prefetch(buffer_size=batch_size)

    # Then batch
    dataset = dataset.batch(batch_size, drop_remainder=False)
    return dataset
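
For reference, here is a minimal sketch (not part of the commit) of how `get_dataset_c4` above could be driven on its own. The module name in the import and the directory paths are assumptions that mirror the launcher script's defaults; in the commit itself the wiring is done by `infer.py`, which is not shown here.

# Hypothetical standalone usage of get_dataset_c4. The module name and the
# paths below are assumptions mirroring the launcher script, not part of the commit.
import tensorflow as tf

from dataloading import get_dataset_c4  # module name is an assumption

DATA_DIR = "/data/c4/realnewslike"
TOKENIZER_DIR = "/models/huggingface/t5/t5-small/saved_models/tokenizer"

ds = get_dataset_c4(
    data_dir=DATA_DIR,
    vocab_model_dir=TOKENIZER_DIR,  # directory containing spiece.model
    tokenizer_dir=TOKENIZER_DIR,
    sequence_length=128,
    batch_size=32,
)

# Inspect one batch: each tensor should come out as (batch_size, sequence_length).
for batch in ds.take(1):
    for name, tensor in batch.items():
        print(f"{name:>24s}: shape={tensor.shape}, dtype={tensor.dtype.name}")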

tftrt/benchmarking-python/huggingface/t5/download_c4.py

Lines changed: 0 additions & 1 deletion
@@ -29,7 +29,6 @@
     },
 }

-# _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/{name}/c4-{split}.{index:05d}-of-{n_shards:05d}.json.gz"
 _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/607bd4c8450a42878aa9ddc051a65a055450ef87/{name}/c4-{split}.{index:05d}-of-{n_shards:05d}.json.gz"