import os

import torch
from torch.utils.data import Dataset
from colossalai.registry import DATASETS


def gpt2_test4gpu350M(**kwargs):
    # GPT-2 350M-style config: hidden size 1024, 24 layers, 16 attention heads.
    # _create_gpt_model is defined elsewhere in the example.
    model_kwargs = dict(hidden_size=1024, depth=24, num_heads=16,
                        max_position_embeddings=2048, **kwargs)
    return _create_gpt_model(**model_kwargs)


@DATASETS.register_module
class WebtextDataset(Dataset):

    def __init__(self, path=None, seq_len=1024, mbs=4) -> None:
        super().__init__()
        if path is not None:
            root = os.path.dirname(path)
            encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
        else:
            encoded_data_cache_path = f'gpt_webtext_{seq_len}.pt'
        # Dummy data: a single random token sequence and a random 0/1 attention mask,
        # shared by every sample.
        self.data = torch.randint(0, 10000, (seq_len,), device='cpu').long()
        self.attention_mask = torch.rand((seq_len, seq_len), device='cpu')
        self.attention_mask = torch.where(self.attention_mask < 0.5, 0, 1)
        print("self.attention_mask:", self.attention_mask[:20])
        self.mbs = mbs
        print("self.mbs:", self.mbs)
        torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path)

    def __len__(self):
        # len(train_loader) is len(dataset) / batch_size
        print("WebtextDataset length (self.mbs * 5):", self.mbs * 5)
        return self.mbs * 5

    def __getitem__(self, index):
        return {'input_ids': self.data,
                'attention_mask': self.attention_mask[0]}, self.data
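
For reference, this is roughly how the dataset ends up being consumed; the DataLoader batch size here is only an illustrative assumption, not my actual launch config:

from torch.utils.data import DataLoader

dataset = WebtextDataset(seq_len=1024, mbs=4)   # __len__ returns mbs * 5 = 20 samples
loader = DataLoader(dataset, batch_size=4)      # so 5 batches per epoch

for inputs, labels in loader:
    # inputs['input_ids']      : (4, 1024), the same dummy sequence repeated
    # inputs['attention_mask'] : (4, 1024), the first row of the random 0/1 mask
    # labels                   : (4, 1024), identical to input_ids
    print(inputs['input_ids'].shape, inputs['attention_mask'].shape, labels.shape)
    break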
When I run this model, ColossalAI spends only about 1 s per iteration, but running the same model on Megatron-LM takes about 100 s per iteration.
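
For what it's worth, this is roughly how I time a single iteration; train_step is a placeholder for whatever forward/backward/step call each framework actually performs, so this is only a sketch, not either framework's API:

import time
import torch

def time_one_iteration(train_step, *args):
    # Wait for any outstanding GPU work before starting the clock.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.time()
    train_step(*args)   # one forward + backward + optimizer step
    # Wait for the GPU to finish before stopping the clock.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time() - start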