How to make a dataset in data list way? #5408

Kevin-shihello-world · 2022-09-10T05:00:26Z

Kevin-shihello-world
Sep 10, 2022

I wanted to make a dataset about GNN's training parameters on training to get further learning of GNN's pattern changing while training. For easy to perform the experiment I change the example code of planetoid and use read_myds_data to load data in Tensor form from .npy documents and transform them to data one by one to form a data list(it seems most graph datasets got information like node_features in one document and use an identity txt to separate them).
I use the read_myds_data function in this feature

from typing import Optional, Callable, List
import os.path as osp
import torch
from torch_geometric.data import InMemoryDataset, download_url
from torch_geometric.io import read_planetoid_data

###假设原数据被以tensor形式保存为.npy格式
##tensor to txt:
##import scipy.io as io
##result1 = np.array(result1)
##np.savetxt('npresult1.txt',result1)

from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.data import Data

def read_myds_data(raw_dir_0, raw_dir_1, raw_dir_2,i):
    ##now raw_dir_2 for my_file.npy
    data=[]
    feats = torch.load('/netp/test_1/' + str(i)+"_"+'my_file.npy' )
    i__ = 0
    
    feats_n = torch.load(raw_dir_1 + str(i) + raw_dir_2)
    graphs = []
    list_1 = []
    list_2 = []
    
    list_3 = []
    list_4 = []
    x = torch.ones([32 ,128], dtype=torch.float)
    h = 33##h:number for nodes ;32 for node of conv1.lin_rel.bias while 
        ##0-31 for extracted features for classify and 33 for the first node for conv1.lin_rel.weight 
    h_r = []
   ## h_1 = 3*feats['conv1.lin_rel.weight'].shape[1]##h_1 for edges
    ###
    
    x = feats['net']['conv1.lin.weight'].t()
    x = x.cuda
    print(feats['net']['conv1.lin.weight'].t().shape[0])
    for i_1 in range(0, feats['net']['conv1.lin.weight'].t().shape[0]):
        ##for here feats[name_0].shape[1] would always be the same as feats[name_2].shape[1] I deal with this 2 in one for loop
        list_1.append(h + i_1)
        list_2.append(h - 1)
    c = torch.Tensor([list_1,list_2]).t()  
    c_1 = torch.Tensor([list_2,list_1]).t() 
    edge_list = torch.cat((c, c_1), 1)
    """
    for i in range(1,4):
        name_0 = 'conv'+ str(i) +'.lin_rel.weight'
        name_1 = "conv"+ str(i) +".lin_rel.bias"
        name_2 = "conv"+ str(i) +".lin_root.weight"
        name_3 = "pool"+ str(i) +".weight"
        
        x = torch.cat((x, feats[str(name_1)].unsqueeze(1).t()), 0)
        x = torch.cat((x, torch.transpose(feats[name_0], 0, 1)), 0)
        x = torch.cat((x, feats[name_2].t()), 0)
        x = torch.cat((x, feats[name_3].t()), 0)
                    
        x_1 = torch.cat((x_1, feats[name_1].unsqueeze(1).t()), 0)
        x_1 = torch.cat((x_1, torch.transpose(feats[name_0], 0, 1)), 0)
        x_1 = torch.cat((x_1, feats[name_2].t()), 0)
        x_1 = torch.cat((x_1, feats[name_3].t()), 0)

        for i_1 in range(0, feats[name_0].shape[1]):
            ##for here feats[name_0].shape[1] would always be the same as feats[name_2].shape[1] I deal with this 2 in one for loop
            list_1[i_1] = h + i_1
            list_2[i_1] = h - 1
            
            list_1[feats[name_0].shape[1] + i_1] = h + feats[name_0].shape[1] + i_1##for nodes of name_2
            list_2[feats[name_0].shape[1] + i_1] = h + i_1
            
            list_1[feats[name_0].shape[1] + feats[name_2].shape[1] + i_1] = h + feats[name_0].shape[1] + i_1##for nodes of name_2
            list_2[feats[name_0].shape[1] + feats[name_2].shape[1] + i_1] = h + 2*feats[name_0].shape[1]##h + 2*feats[name_0].shape[1] for feats[name_3]
        
        edge_list_0 = torch.Tensor([list_0,list_1]).t()  
        
        if i != 3:  
            z = torch.transpose(feats[name_0], 0, 1)
            Re = nn.ReLU()
            edge_list_1 = torch.nonzero(Re(z - 0.5*torch.new_ones(z))).t()
            edge_list_1[0] = edge_list_1[0] + h*torch.new_ones(edge_list_1[0])##for name_0 
            edge_list_1[1] = edge_list_1[1] + (h + 2*feats[name_0].shape[1])*torch.new_ones(edge_list_1[1])##for next name_0 
            ##or lin1.weight
        else:
            z = torch.transpose(feats[name_0], 0, 1)
            Re = nn.ReLU()
            edge_list_1_ = torch.nonzero(Re(z - 0.5*torch.new_ones(z))).t()
            ##h_a = torch.arange(1, 128)
            adder_1 = torch.new_ones([1, 128])
            adder_2 = torch.arange(1, 128)
            adder = torch.cat((h_r[2]*adder_1, h_r[3]*adder_1), 1)
            edge_list_1_[0] = edge_list_1_[0] + adder
            edge_list_1_[1] = edge_list_1_[1] + (h + 2*feats[name_0].shape[1])*torch.new_ones(edge_list_1_[1]) 
            
            ##so now I write connections to lin1.bias
            edge_list_1 = torch.cat((edge_list_1_[1], (h + 3*feats[name_0].shape[1])*torch.new_ones(edge_list_1_[1])), 0)
            edge_list_1 = torch.cat((edge_list_1_, edge_list_1), 1)
        h_r[i] = h
        h = h + 3*feats[names_0].shape[1]
        
        edge_list = torch.cat((edge_list, edge_list_0, edge_list_1), 1)
        ###
        z = torch.transpose(feats["lin1.weight"], 0, 1)
        Re = nn.ReLU()
        edge_list_1_ = torch.nonzero(Re(z - 0.5*torch.new_ones(z))).t()
        edge_list_1_[0] = edge_list_1_[0] + h*torch.new_ones(edge_list_1_[0])
        edge_list_1_[1] = edge_list_1_[1] + (h + feats["lin1.weight"].shape[1])*torch.new_ones(edge_list_1_[1])
        edge_list_1 = torch.cat((edge_list_1_[1], (h + 2*feats["lin1.weight"].shape[1])*torch.new_ones(edge_list_1_[1])), 0)
        edge_list_1 = torch.cat((edge_list_1_, edge_list_1), 1)
        edge_list = torch.cat((edge_list, edge_list_0, edge_list_1), 1)
    
    """
    
    ###
    feats_L = torch.load('/netp/test_1_/' + str(i_1)+"_"+'my_file.npy')
    loss = feats_L['loss'] 
    y = loss
    ##data=Data(x=x,edge_index=edge_list,y = y) #包装成Data类
    return Data(x=x,edge_index=edge_list,y = y) #包装成Data类
class Myds(InMemoryDataset):

    ##url = 'https://github.com/kimiyoung/planetoid/raw/master/data'
    def __init__(self, root: str, name: str):
                 ##, split: str = "public",
                 ##num_train_per_class: int = 20, num_val: int = 500,
                 ##num_test: int = 1000, transform: Optional[Callable] = None,
                 ##pre_transform: Optional[Callable] = None):
        self.name = name

        super().__init__(root)##, transform, pre_transform = None, pre_filter = None) #等同于super(Planetoid,self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])#InMemoryDataset继承于Dataset类，processed_paths是Dataset类的属性，有self.processed_dir拼接得到
        ###
        self.data, self.slices = torch.load(self.processed_paths[0])#InMemoryDataset继承于Dataset类，processed_paths是Dataset类的属性，有self.processed_dir拼接得到
        
        ###
        '''
        # 将数据集划分为训练、验证和测试集
        self.split = split
        assert self.split in ['public', 'full', 'random']

        if split == 'full':
            data = self.get(0)
            data.train_mask.fill_(True)
            data.train_mask[data.val_mask | data.test_mask] = False
            self.data, self.slices = self.collate([data])

        elif split == 'random':
            data = self.get(0)
            data.train_mask.fill_(False)
            for c in range(self.num_classes):
                idx = (data.y == c).nonzero(as_tuple=False).view(-1)
                idx = idx[torch.randperm(idx.size(0))[:num_train_per_class]]
                data.train_mask[idx] = True

            remaining = (~data.train_mask).nonzero(as_tuple=False).view(-1)
            remaining = remaining[torch.randperm(remaining.size(0))]

            data.val_mask.fill_(False)
            data.val_mask[remaining[:num_val]] = True

            data.test_mask.fill_(False)
            data.test_mask[remaining[num_val:num_val + num_test]] = True

            self.data, self.slices = self.collate([data])
            '''

    @property
    def raw_dir(self) -> str:
        return '/netp/test_1/'

    @property
    def processed_dir(self) -> str:
        return osp.join(self.root, self.name, 'processed')

    @property
    def raw_file_names(self) -> List[str]:
        ##names = 
        ##return [f'ind.{self.name.lower()}.{name}' for name in names]
        return ['/netp/test_1/', 'netp/test_1_/', 'my_file.npy']
    @property
    def processed_file_names(self) -> str:
        return 'data.pt'

    def download(self):
        pass

    def process(self):
        ###
        # Read data into huge `Data` list.
        data_list = []
        i = 0
        for i in range(1,10):
            data_list += read_myds_data('/netp/test_1/', 'netp/test_1_/', 'my_file.npy', i)#调用函数读取数据，包装成Data

        data, slices = self.collate(data_list)
        data = data if self.pre_transform is None else self.pre_transform(data)
        torch.save((data, slices), self.processed_paths[0])
        ###

    def __repr__(self) -> str:
        return f'{self.name}()'

And the compiler warned me in this way :
/tmp/ipykernel_100/359689239.py in process(self)
192 data_list += read_myds_data('/netp/test_1/', 'netp/test_1_/', 'my_file.npy', i)#调用函数读取数据，包装成Data
193
--> 194 data, slices = self.collate(data_list)
195 data = data if self.pre_transform is None else self.pre_transform(data)
196 torch.save((data, slices), self.processed_paths[0])

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch_geometric/data/in_memory_dataset.py in collate(data_list)
103 return data_list[0], None
104
--> 105 data, slices, _ = collate(
106 data_list[0].class,
107 data_list=data_list,

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch_geometric/data/collate.py in collate(cls, data_list, increment, add_batch, follow_batch, exclude_keys)
35
36 # Create empty stores:
---> 37 out.stores_as(data_list[0])
38
39 follow_batch = set(follow_batch or [])

AttributeError: 'tuple' object has no attribute 'stores_as'

Thanks for your help!

Answered by rusty1s

Sep 15, 2022

Note that raw_dir needs to be a string, not a set/list of strings:

@property
def raw_dir(self) -> str:
    return '/netp/test_1'

View full answer

EdisonLeeeee · 2022-09-11T15:08:02Z

EdisonLeeeee
Sep 11, 2022
Collaborator

Hi @Kevin-shihello-world
Could you please make it clear? I am sorry I cannot get your point based on your code snippet. It would be appreciated if you could provide a minimal example.

6 replies

Kevin-shihello-world Sep 18, 2022
Author

ok , thank you and rust1s , I now get it that '' may be judged as a character rather than string hide my face.jpg

Kevin-shihello-world Sep 18, 2022
Author

By the way, I also want to know how the gcnconv in PYG works. I mean it says in the document that it use pyg.Messagepassing.aggregate function to transport information through edges in graph and the code work like that:
row, col = edge_index
# row(12431), x_j(12431, out_channels)
out = scatter(x_j, row, dim=0, reduce='sum')
but how can we transport information when we only give row list of edge_index to the function dealing with the transportation problem?
And I still do not get how torch_scatter.scatter_sum works there exist too little blogs about that and even though I did experiments I still do not understand it wholely .
And I also wonder why it seems PYG deal all gcnconv like messagepassing way I mean I remember gcns are kind of spectral GNN so they are supposed to do the thing like decomposing a laplacian matrix.
I think it would help a lot of people if the answers to these questions. So I would appreciate if you can help me get some ideas to it. Thanks.
best wishes.

Kevin-shihello-world Sep 19, 2022
Author

By the way, your GreatX work looks cool

EdisonLeeeee Sep 19, 2022
Collaborator

Thanks : )

EdisonLeeeee Sep 19, 2022
Collaborator

Per your questions:

how can we transport information when we only give row list of edge_index to the function dealing with the transportation problem?

The message passing consists of two steps in PyG: gather and scatter. Actually, x_j=x[col] is the gather step and represents the neighborhood messages (features) in your example. In scatter step, it aggregates (or somehow you can understand it as merge) these messages grouped by source nodes row with different operations (e.g., sum). Here row and col can be exchanged alternatively.

And I still do not get how torch_scatter.scatter_sum works there exist too few blogs about that and even though I did experiments I still do not understand it wholely.

You can go through this slide and figure out how it works in PyG.

And I also wonder why PYG was created to deal with all gcnconv in messagepassing way I mean I remember gcns are kind of spectral GNN so they are supposed to do the thing like decomposing a laplacian matrix.

The difference between spectral GNNs and spatial GNNs lies more at the original motivation. If a GNN is motivated by spectral graph theory, people call it a spectral one. This does not mean that we cannot formulate a spectral GNN from the spatial perspective, vice versa. In PyG, GCN is implemented as a spatial one for better efficiency.

rusty1s · 2022-09-15T09:46:26Z

rusty1s
Sep 15, 2022
Maintainer

Note that raw_dir needs to be a string, not a set/list of strings:

@property
def raw_dir(self) -> str:
    return '/netp/test_1'

20 replies

Kevin-shihello-world Sep 24, 2022
Author

Thanks, I found it was that I found the dataset.py in PYG has some functions decorated by @Property so I thought they must be functions must contain in the dataset class inherited the prototype of the dataset in PYG so I added them in and now it turned out to be no need to rewrite them all for they are already in the father class. I'm sorry for your time covering my face.jpg it just sometimes when you are on it it becomes quite confusing. And I am now in a len of dataset problem, it warns me like this:

python
ZeroDivisionError                         Traceback (most recent call last)
/tmp/ipykernel_96/1362167373.py in <cell line: 132>()
    131 
    132 for epoch in range(1, 10):
--> 133     train_loss = train(train_loader)
    134     _, train_acc = test(train_loader)
    135     val_loss, val_acc = test(val_loader)

/tmp/ipykernel_96/1362167373.py in train(train_loader)
    111             i = 0
    112         i_1 = i_1 + 1
--> 113     return loss_all / len(train_dataset)
    114 
    115 

ZeroDivisionError: division by zero

while I loaded it in this way:

python

dataset = Myds("/netp", "try_")
dataset = dataset.shuffle()
n = len(dataset) // 10
test_dataset = dataset[:n]
train_dataset = dataset[n:]
test_loader = DataLoader(test_dataset, batch_size=10)
train_loader = DataLoader(train_dataset, batch_size=17)
###
###avg_num_nodes = int(dataset.data.x.size(0) / len(dataset))
avg_num_nodes = int(len(dataset[0][0]) / len(dataset))
n = (len(dataset) + 9) // 10
test_dataset = dataset[:n]
val_dataset = dataset[n:2 * n]
train_dataset = dataset[2 * n:]
test_loader = DataLoader(test_dataset, batch_size=20)
val_loader = DataLoader(val_dataset, batch_size=20)
train_loader = DataLoader(train_dataset, batch_size=20)

And the 'len(dataset)'sentence told me I've got 27 data in my dataset so I thought it must be I divided my dataset wrongly, but how to understand division like ' dataset[:n]'.It may be easy to answer but I just do not get the point. Thank you.

EdisonLeeeee Sep 24, 2022
Collaborator

Your code to split datasets looks fine and I have no idea about your issue. Sorry.
BTW, you should divide the loss by len(train_loader):

loss_all / len(train_loader)

Kevin-shihello-world Sep 25, 2022
Author

Thanks, though it didn't solve my question. So I deleted len(train_loader) and it warns me like this and I think this may help us get deeper information on this question it may be just I used '[]' instead '()':

python
 TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_92/3373408995.py in <cell line: 133>()
    135     _, train_acc = test(train_loader)
    136     val_loss, val_acc = test(val_loader)
--> 137     test_loss, test_acc = test(test_loader)
    138     print(f'Epoch: {epoch:03d}, Train Loss: {train_loss:.3f}, '
    139           f'Train Acc: {train_acc:.3f}, Val Loss: {val_loss:.3f}, '

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch/autograd/grad_mode.py in decorate_context(*args, **kwargs)
     25         def decorate_context(*args, **kwargs):
     26             with self.clone():
---> 27                 return func(*args, **kwargs)
     28         return cast(F, decorate_context)
     29 

/tmp/ipykernel_92/3373408995.py in test(loader)
    121     loss_all = 0
    122 
--> 123     for data in loader:
    124         data = data.to(device)
    125         pred, tot_loss, g_g, g_e = model(data.x, data.edge_index, data.batch)

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch/utils/data/dataloader.py in __next__(self)
    528             if self._sampler_iter is None:
    529                 self._reset()
--> 530             data = self._next_data()
    531             self._num_yielded += 1
    532             if self._dataset_kind == _DatasetKind.Iterable and \

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch/utils/data/dataloader.py in _next_data(self)
    568     def _next_data(self):
    569         index = self._next_index()  # may raise StopIteration
--> 570         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    571         if self._pin_memory:
    572             data = _utils.pin_memory.pin_memory(data)

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     50         else:
     51             data = self.dataset[possibly_batched_index]
---> 52         return self.collate_fn(data)

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch_geometric/loader/dataloader.py in __call__(self, batch)
     32             return type(elem)(*(self(s) for s in zip(*batch)))
     33         elif isinstance(elem, Sequence) and not isinstance(elem, str):
---> 34             return [self(s) for s in zip(*batch)]
     35 
     36         raise TypeError(f'DataLoader found invalid type: {type(elem)}')

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch_geometric/loader/dataloader.py in <listcomp>(.0)
     32             return type(elem)(*(self(s) for s in zip(*batch)))
     33         elif isinstance(elem, Sequence) and not isinstance(elem, str):
---> 34             return [self(s) for s in zip(*batch)]
     35 
     36         raise TypeError(f'DataLoader found invalid type: {type(elem)}')

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch_geometric/loader/dataloader.py in __call__(self, batch)
     32             return type(elem)(*(self(s) for s in zip(*batch)))
     33         elif isinstance(elem, Sequence) and not isinstance(elem, str):
---> 34             return [self(s) for s in zip(*batch)]
     35 
     36         raise TypeError(f'DataLoader found invalid type: {type(elem)}')

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch_geometric/loader/dataloader.py in <listcomp>(.0)
     32             return type(elem)(*(self(s) for s in zip(*batch)))
     33         elif isinstance(elem, Sequence) and not isinstance(elem, str):
---> 34             return [self(s) for s in zip(*batch)]
     35 
     36         raise TypeError(f'DataLoader found invalid type: {type(elem)}')

~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch_geometric/loader/dataloader.py in __call__(self, batch)
     34             return [self(s) for s in zip(*batch)]
     35 
---> 36         raise TypeError(f'DataLoader found invalid type: {type(elem)}')
     37 
     38     def collate(self, batch):  # Deprecated...

TypeError: DataLoader found invalid type: <class 'builtin_function_or_method'>

I searched for it and on the internet, it shows me its use '[]'instead '()' problem so I wrote
'dataset = Myds("/netp", "try_")
dataset[0]'
and it told me dataset[0] is in '[]' character so should I find a way to eliminate '[]'?What should I do? I would appreciate it if someone could show me an answer.

Kevin-shihello-world Sep 29, 2022
Author

Well, it seems I let this question be too general. Building a dataset is a quite big thing. So how about I mark the answer to my first request as the answer to this question and ask another question as to how to build a dataset in a data list way (2)? And I also wonder if I can just use a data list without using an in-memory dataset. I mean the dataset to analyze the training progress of a GNN may contain 20 graphs with 516 points and 128 features each, I thought it may be too much data for the load without the dataset way for those data in a singer data list would be in GPU, so how many data can we load in data list way without dataset?
Thank you all, I am just an undergraduate student and your word did let me learn a lot, both in pyg knowledge and how to discover the uses of programs developed by other people and I think those words may also inspire the following people who want to develop datasets in in-memory datasets with data extracted not from .txt documents. I would mark the answer to my first request as the answer to this question after one of you sent me an acknowledgment. Thank you.

rusty1s Sep 29, 2022
Maintainer

Yeah, you do not necessarily need to use the Dataset helper. Holding your data objects as part of a list works as well - and you can directly input that into the DataLoader:

loader = DataLoader([data1, data2, data3, data4, ...], ...)

If you still want to convert your list of data objects into a dataset, you can also do

class MyDataset(InMemoryDataset):
    def __init__(self, data_list, transform=None):
         super().__init__(root_dir=None, transform=transform)
         self.data, self.slices = self.collate(data_list)

dataset = MyDataset(data_list)

How to make a dataset in data list way? #5408

Uh oh!

Uh oh!

Kevin-shihello-world Sep 10, 2022

Replies: 2 comments · 26 replies

Uh oh!

Uh oh!

EdisonLeeeee Sep 11, 2022 Collaborator

Uh oh!

Kevin-shihello-world Sep 18, 2022 Author

Uh oh!

Kevin-shihello-world Sep 18, 2022 Author

Uh oh!

Kevin-shihello-world Sep 19, 2022 Author

Uh oh!

EdisonLeeeee Sep 19, 2022 Collaborator

Uh oh!

EdisonLeeeee Sep 19, 2022 Collaborator

Uh oh!

rusty1s Sep 15, 2022 Maintainer

Uh oh!

Kevin-shihello-world Sep 24, 2022 Author

Uh oh!

EdisonLeeeee Sep 24, 2022 Collaborator

Uh oh!

Kevin-shihello-world Sep 25, 2022 Author

Uh oh!

Kevin-shihello-world Sep 29, 2022 Author

Uh oh!

rusty1s Sep 29, 2022 Maintainer

Kevin-shihello-world
Sep 10, 2022

Replies: 2 comments 26 replies

EdisonLeeeee
Sep 11, 2022
Collaborator

Kevin-shihello-world Sep 18, 2022
Author

Kevin-shihello-world Sep 18, 2022
Author

Kevin-shihello-world Sep 19, 2022
Author

EdisonLeeeee Sep 19, 2022
Collaborator

EdisonLeeeee Sep 19, 2022
Collaborator

rusty1s
Sep 15, 2022
Maintainer

Kevin-shihello-world Sep 24, 2022
Author

EdisonLeeeee Sep 24, 2022
Collaborator

Kevin-shihello-world Sep 25, 2022
Author

Kevin-shihello-world Sep 29, 2022
Author

rusty1s Sep 29, 2022
Maintainer