DALI TFRecord pipeline sharding on multi-GPU environment #12792
Replies: 1 comment
-
Here is one working example, obtained by changing the pipeline a little bit. Comparing the DALI dataloader output with the PyTorch dataloader, it looks like an epoch in the DALI dataloader is different from an epoch in the PyTorch dataloader.
-
Hey guys, I have a self-contained, working example of PyTorch Lightning with a DALI TFRecord pipeline in a multi-GPU environment, and I have some questions regarding GPU sharding and training with PyTorch Lightning.
questions:
1. The output of make_dali_dataloader is a list of Dict[str, torch.Tensor], and the list length matches the number of shards in both the example 1 dataloader and the example 2 dataloader. Is that expected? And is each Dict[str, torch.Tensor] in the output list the data for one shard?
2. Should I use global_rank to retrieve the sharded data in the process_batch function? Which one of the following two process_batch methods should I use, and if both are wrong, can you provide an example? (See the sketch after this list.)
3. When the number of shards in make_dali_dataloader matches the number of GPU devices (the 1st make_dali_dataloader), the total number of training examples comes to about 1 epoch. But when the number of shards in make_dali_dataloader does not match the GPU devices, the total can be more than 1 epoch: in my case 1 epoch should be 1k examples, but the 2nd make_dali_dataloader returns a total of 2.8k across 8 GPUs. How is one epoch defined in DDP training?
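To make question 2 concrete: the iterator hands training_step a list of Dict[str, torch.Tensor] (one entry per pipeline), and the two process_batch variants I am weighing are roughly along the following lines. This is a simplified sketch, not the exact methods from my script, and the "image"/"label" keys are placeholders.

```python
# Simplified sketch of the two process_batch variants in question 2
# (placeholders, not the exact methods from the script).
from typing import Dict, List

import torch

Batch = List[Dict[str, torch.Tensor]]  # what DALIGenericIterator yields per step

# Variant A: index the list with this process's global rank
# (assumes the dataloader built one pipeline per rank in every process).
def process_batch_by_rank(batch: Batch, global_rank: int):
    data = batch[global_rank]
    return data["image"], data["label"]

# Variant B: always take the first entry
# (assumes each rank built only its own single-shard pipeline, so len(batch) == 1).
def process_batch_first(batch: Batch):
    data = batch[0]
    return data["image"], data["label"]

# Inside a LightningModule.training_step this would be called as, e.g.:
#     images, labels = process_batch_by_rank(batch, self.trainer.global_rank)
```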
Major dependency versions (running on Ubuntu 18.04):

Self-contained script to reproduce:
Thank you very much for reading my questions!