Create TFRecords using the correct dataset name

Swetha Mandava · Swetha Mandava · commit 9d4c9f3eb01f · 2020-08-13T12:52:46.000-07:00
diff --git a/TensorFlow/LanguageModeling/BERT/data/create_datasets_from_start.sh b/TensorFlow/LanguageModeling/BERT/data/create_datasets_from_start.sh
@@ -28,26 +28,22 @@ python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
 python3 /workspace/bert/data/bertPrep.py --action download --dataset mrpc
 python3 /workspace/bert/data/bertPrep.py --action download --dataset sst-2
 
+DATASET="wikicorpus_en"
 # Properly format the text files
 if [ "$to_download" = "wiki_books" ] ; then
     python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset bookscorpus
-fi
-python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset wikicorpus_en
-
-if [ "$to_download" = "wiki_books" ] ; then
     DATASET="books_wiki_en_corpus"
-else
-    DATASET="wikicorpus_en"
 fi
+python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset wikicorpus_en
 
 # Shard the text files
 python3 /workspace/bert/data/bertPrep.py --action sharding --dataset $DATASET
 
 # Create TFRecord files Phase 1
-python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset books_wiki_en_corpus --max_seq_length 128 \
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset ${DATASET} --max_seq_length 128 \
  --max_predictions_per_seq 20 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt
 
 
 # Create TFRecord files Phase 2
-python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset books_wiki_en_corpus --max_seq_length 512 \
+python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset ${DATASET} --max_seq_length 512 \
  --max_predictions_per_seq 80 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt