Developed by Aamir Mirza.
Create a conda virtual environment with Python 3.9.

Install PyTorch 1.13.1 (not 2.0):
`conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.7 -c pytorch -c nvidia`

Install the latest transformers:
`conda install -c conda-forge transformers`

Install DeepSpeed from GitHub rather than from pip, and build it with CPU Adam optimiser support:
`git clone https://github.com/microsoft/DeepSpeed`
`DS_BUILD_CPU_ADAM=1 pip install .`

Install accelerate via pip, then:
`pip install Ninja`
`conda install -c conda-forge mpi4py`

Train via the command line, for example:
`deepspeed train_gptNX_v2.py --num_gpus=2`

In my case I have 2x 3090 24GB.
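Before launching, it is worth sanity-checking that the pieces above are actually in place; the snippet below is a minimal sketch of my own (not part of the original recipe), and DeepSpeed's `ds_report` command gives a fuller per-op build report.

```python
# Minimal environment check (my own sketch; assumes it is run inside the conda env above).
import torch
import transformers
import deepspeed

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("visible GPUs:", torch.cuda.device_count())
print("transformers:", transformers.__version__)
print("deepspeed:", deepspeed.__version__)
```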
The training script (`train_gptNX_v2.py`):

```python
from transformers import (GPTNeoXForCausalLM, GPTNeoXTokenizerFast, TextDataset,
                          DefaultDataCollator, DataCollatorForLanguageModeling, DataCollatorWithPadding)
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import os

os.environ['OMPI_MCA_opal_cuda_support'] = 'true'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# If you have a single GPU, change this to "1".
os.environ["WORLD_SIZE"] = "2"
# Change this to your requirement, for example 4096 (the maximum).
MAX_LEN = 1024
# DeepSpeed ZeRO stage 2 config; the fp16 block matches fp16=True in the TrainingArguments below.
stage2_config = """{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"
}"""
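# The JSON string above is not written to disk anywhere in this script, while the
# TrainingArguments below expect an existing config file. A minimal sketch (my own
# addition; the local file name is a hypothetical choice) of generating that file:
import json

ds_config_path = "stage2_config.json"
with open(ds_config_path, "w") as f:
    json.dump(json.loads(stage2_config), f, indent=2)
# The Hugging Face integration also accepts the parsed dict directly,
# i.e. deepspeed=json.loads(stage2_config) in TrainingArguments.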
class CustomTrainer(Trainer):
    # Causal LM loss: reuse input_ids as labels; the model shifts them internally.
    def compute_loss(self, model_a, inputs_a, return_outputs=False):
        outputs = model_a(**inputs_a, labels=inputs_a["input_ids"])
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss
tokenizer = GPTNeoXTokenizerFast.from_pretrained("stabilityai/stablelm-base-alpha-3b")
def process_data(examples):
    texts = examples["text"]
    # Remove empty lines
    texts = [text for text in texts if len(text) > 0 and not text.isspace()]
    # Remove lines that are too long
    texts = [text for text in texts if len(text) < 512]
    # Remove lines that are too short
    texts = [text for text in texts if len(text) > 16]
    # Append a space and a newline character to each line
    texts = [text + ' ' + '\n' for text in texts]
    examples["text"] = texts
    return examples
# Process the dataset's [text] column: use the tokenizer to get input_ids and attention_mask.
def process_data_add_mask(examples):
    text = examples['text']
    tokenizer.pad_token = tokenizer.eos_token
    # Tokenize text
    encoded_dict = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=MAX_LEN
    )
    # Add input_ids and attention_mask to the example
    examples['input_ids'] = encoded_dict['input_ids']
    examples['attention_mask'] = encoded_dict['attention_mask']
    return examples
imdb_dataset = load_dataset('imdb')
imdb_dataset_train = imdb_dataset['train']
imdb_dataset_train = imdb_dataset_train.shuffle()
imdb_dataset_train = imdb_dataset_train.map(process_data, batched=True, remove_columns=['label'])
imdb_dataset_val = imdb_dataset['test']
imdb_dataset_val = imdb_dataset_val.shuffle()
imdb_dataset_val = imdb_dataset_val.map(process_data, batched=True, remove_columns=['label'])
train_dataset = imdb_dataset_train.map(process_data_add_mask, remove_columns=["text"], batched=True)
val_dataset = imdb_dataset_val.map(process_data_add_mask, remove_columns=["text"], batched=True)
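# (My own addition) Spot-check one processed example: the Trainer sees only input_ids and
# attention_mask, padded per batch and truncated to at most MAX_LEN tokens.
print({k: len(v) for k, v in train_dataset[0].items() if isinstance(v, list)})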
model = GPTNeoXForCausalLM.from_pretrained("stabilityai/stablelm-base-alpha-3b")

# An absolute path is required for the DeepSpeed config;
# you can use the JSON above to create your own config file.
z_optimiser = '/two-tb/train_GPTNX/zeromq_config/stablelm-base-alpha-3b_config.json'

data_collator = DataCollatorWithPadding(tokenizer=tokenizer,
                                        return_tensors="pt")
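# Alternative (my own sketch, not from the original recipe): DataCollatorForLanguageModeling
# with mlm=False copies input_ids into labels and masks padding with -100, so the stock
# Trainer loss could be used instead of the CustomTrainer override above.
# lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)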
training_args_v2 = TrainingArguments(
    output_dir="./trained_model",
    learning_rate=2e-5,
    save_total_limit=2,
    fp16=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=12,
    evaluation_strategy="epoch",
    deepspeed=z_optimiser,
    num_train_epochs=1
)
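# Note (my own addition): the "auto" values in the DeepSpeed JSON (optimizer lr/betas/eps,
# warmup, batch sizes, gradient accumulation and clipping) are filled in from these
# TrainingArguments by the Hugging Face DeepSpeed integration, keeping the two configs consistent.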
# Set up the trainer
trainer = CustomTrainer(
    model=model,
    args=training_args_v2,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
trainer.save_model()
```
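After training, a quick generation test against the saved model is an easy way to confirm the run produced something usable. This is my own sketch rather than part of the original script; it assumes the `./trained_model` output_dir used above and loads the tokenizer from the base checkpoint.

```python
# Post-training generation check (my own sketch; prompt and sampling settings are arbitrary).
import torch
from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast

tok = GPTNeoXTokenizerFast.from_pretrained("stabilityai/stablelm-base-alpha-3b")
lm = GPTNeoXForCausalLM.from_pretrained("./trained_model", torch_dtype=torch.float16).cuda()

ids = tok("The movie was", return_tensors="pt").input_ids.cuda()
out = lm.generate(ids, max_new_tokens=40, do_sample=True, top_p=0.95, temperature=0.8)
print(tok.decode(out[0], skip_special_tokens=True))
```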