How to make a dataset as a data list? #5408
-
I want to build a dataset of a GNN's training parameters, captured during training, so that I can study how the GNN's patterns change as training progresses. To make the experiment easy to run, I modified the Planetoid example code: `read_myds_data` loads tensors from `.npy` files and converts them into `Data` objects one by one to form a data list (it seems most graph datasets store information such as node features in a single file and use an index txt file to separate the samples).

```python
from typing import Optional, Callable, List
import os.path as osp
import torch
from torch_geometric.data import InMemoryDataset, download_url
from torch_geometric.io import read_planetoid_data
### Assume the original data was saved to disk as tensors in .npy format
## tensor to txt:
## import scipy.io as io
## result1 = np.array(result1)
## np.savetxt('npresult1.txt', result1)
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.data import Data
def read_myds_data(raw_dir_0, raw_dir_1, raw_dir_2, i):
    ## raw_dir_2 is currently 'my_file.npy'
    data = []
    # The files carry a .npy suffix but are read with torch.load, so they are
    # assumed to be torch-serialized checkpoint dicts
    feats = torch.load('/netp/test_1/' + str(i) + "_" + 'my_file.npy')
    i__ = 0
    feats_n = torch.load(raw_dir_1 + str(i) + raw_dir_2)
    graphs = []
    list_1 = []
    list_2 = []
    list_3 = []
    list_4 = []
    x = torch.ones([32, 128], dtype=torch.float)
    h = 33  ## h: node counter; node 32 is conv1.lin_rel.bias, nodes 0-31 are
    ## the extracted features for classification, and 33 is the first node for conv1.lin_rel.weight
    h_r = []
    ## h_1 = 3 * feats['conv1.lin_rel.weight'].shape[1]  ## h_1 for edges
    ###
    x = feats['net']['conv1.lin.weight'].t()
    x = x.cuda()  # fixed: x.cuda is the bound method, it has to be called
    print(feats['net']['conv1.lin.weight'].t().shape[0])
    for i_1 in range(0, feats['net']['conv1.lin.weight'].t().shape[0]):
        ## feats[name_0].shape[1] is always equal to feats[name_2].shape[1],
        ## so both are handled in this one loop
        list_1.append(h + i_1)
        list_2.append(h - 1)
    # edge_index must have shape [2, num_edges], so stack the two index lists
    # as rows and concatenate both edge directions along dim 1
    c = torch.tensor([list_1, list_2], dtype=torch.long)
    c_1 = torch.tensor([list_2, list_1], dtype=torch.long)
    edge_list = torch.cat((c, c_1), 1)
"""
for i in range(1,4):
name_0 = 'conv'+ str(i) +'.lin_rel.weight'
name_1 = "conv"+ str(i) +".lin_rel.bias"
name_2 = "conv"+ str(i) +".lin_root.weight"
name_3 = "pool"+ str(i) +".weight"
x = torch.cat((x, feats[str(name_1)].unsqueeze(1).t()), 0)
x = torch.cat((x, torch.transpose(feats[name_0], 0, 1)), 0)
x = torch.cat((x, feats[name_2].t()), 0)
x = torch.cat((x, feats[name_3].t()), 0)
x_1 = torch.cat((x_1, feats[name_1].unsqueeze(1).t()), 0)
x_1 = torch.cat((x_1, torch.transpose(feats[name_0], 0, 1)), 0)
x_1 = torch.cat((x_1, feats[name_2].t()), 0)
x_1 = torch.cat((x_1, feats[name_3].t()), 0)
for i_1 in range(0, feats[name_0].shape[1]):
##for here feats[name_0].shape[1] would always be the same as feats[name_2].shape[1] I deal with this 2 in one for loop
list_1[i_1] = h + i_1
list_2[i_1] = h - 1
list_1[feats[name_0].shape[1] + i_1] = h + feats[name_0].shape[1] + i_1##for nodes of name_2
list_2[feats[name_0].shape[1] + i_1] = h + i_1
list_1[feats[name_0].shape[1] + feats[name_2].shape[1] + i_1] = h + feats[name_0].shape[1] + i_1##for nodes of name_2
list_2[feats[name_0].shape[1] + feats[name_2].shape[1] + i_1] = h + 2*feats[name_0].shape[1]##h + 2*feats[name_0].shape[1] for feats[name_3]
edge_list_0 = torch.Tensor([list_0,list_1]).t()
if i != 3:
z = torch.transpose(feats[name_0], 0, 1)
Re = nn.ReLU()
edge_list_1 = torch.nonzero(Re(z - 0.5*torch.new_ones(z))).t()
edge_list_1[0] = edge_list_1[0] + h*torch.new_ones(edge_list_1[0])##for name_0
edge_list_1[1] = edge_list_1[1] + (h + 2*feats[name_0].shape[1])*torch.new_ones(edge_list_1[1])##for next name_0
##or lin1.weight
else:
z = torch.transpose(feats[name_0], 0, 1)
Re = nn.ReLU()
edge_list_1_ = torch.nonzero(Re(z - 0.5*torch.new_ones(z))).t()
##h_a = torch.arange(1, 128)
adder_1 = torch.new_ones([1, 128])
adder_2 = torch.arange(1, 128)
adder = torch.cat((h_r[2]*adder_1, h_r[3]*adder_1), 1)
edge_list_1_[0] = edge_list_1_[0] + adder
edge_list_1_[1] = edge_list_1_[1] + (h + 2*feats[name_0].shape[1])*torch.new_ones(edge_list_1_[1])
##so now I write connections to lin1.bias
edge_list_1 = torch.cat((edge_list_1_[1], (h + 3*feats[name_0].shape[1])*torch.new_ones(edge_list_1_[1])), 0)
edge_list_1 = torch.cat((edge_list_1_, edge_list_1), 1)
h_r[i] = h
h = h + 3*feats[names_0].shape[1]
edge_list = torch.cat((edge_list, edge_list_0, edge_list_1), 1)
###
z = torch.transpose(feats["lin1.weight"], 0, 1)
Re = nn.ReLU()
edge_list_1_ = torch.nonzero(Re(z - 0.5*torch.new_ones(z))).t()
edge_list_1_[0] = edge_list_1_[0] + h*torch.new_ones(edge_list_1_[0])
edge_list_1_[1] = edge_list_1_[1] + (h + feats["lin1.weight"].shape[1])*torch.new_ones(edge_list_1_[1])
edge_list_1 = torch.cat((edge_list_1_[1], (h + 2*feats["lin1.weight"].shape[1])*torch.new_ones(edge_list_1_[1])), 0)
edge_list_1 = torch.cat((edge_list_1_, edge_list_1), 1)
edge_list = torch.cat((edge_list, edge_list_0, edge_list_1), 1)
"""
    ###
    feats_L = torch.load('/netp/test_1_/' + str(i_1) + "_" + 'my_file.npy')
    loss = feats_L['loss']
    y = loss
    ## data = Data(x=x, edge_index=edge_list, y=y)  # wrap into a Data object
    return Data(x=x, edge_index=edge_list, y=y)  # wrap into a Data object
class Myds(InMemoryDataset):
    ## url = 'https://github.com/kimiyoung/planetoid/raw/master/data'
    def __init__(self, root: str, name: str):
        ## , split: str = "public",
        ## num_train_per_class: int = 20, num_val: int = 500,
        ## num_test: int = 1000, transform: Optional[Callable] = None,
        ## pre_transform: Optional[Callable] = None):
        self.name = name
        super().__init__(root)  ## , transform, pre_transform=None, pre_filter=None)
        # equivalent to super(Planetoid, self).__init__(root, transform, pre_transform)
        # InMemoryDataset inherits from Dataset; processed_paths is a Dataset
        # property obtained by joining the processed file names onto self.processed_dir
        self.data, self.slices = torch.load(self.processed_paths[0])
        '''
        # Split the dataset into train, validation and test sets
        self.split = split
        assert self.split in ['public', 'full', 'random']
        if split == 'full':
            data = self.get(0)
            data.train_mask.fill_(True)
            data.train_mask[data.val_mask | data.test_mask] = False
            self.data, self.slices = self.collate([data])
        elif split == 'random':
            data = self.get(0)
            data.train_mask.fill_(False)
            for c in range(self.num_classes):
                idx = (data.y == c).nonzero(as_tuple=False).view(-1)
                idx = idx[torch.randperm(idx.size(0))[:num_train_per_class]]
                data.train_mask[idx] = True
            remaining = (~data.train_mask).nonzero(as_tuple=False).view(-1)
            remaining = remaining[torch.randperm(remaining.size(0))]
            data.val_mask.fill_(False)
            data.val_mask[remaining[:num_val]] = True
            data.test_mask.fill_(False)
            data.test_mask[remaining[num_val:num_val + num_test]] = True
            self.data, self.slices = self.collate([data])
        '''
    @property
    def raw_dir(self) -> str:
        return '/netp/test_1/'

    @property
    def processed_dir(self) -> str:
        return osp.join(self.root, self.name, 'processed')

    @property
    def raw_file_names(self) -> List[str]:
        ## names =
        ## return [f'ind.{self.name.lower()}.{name}' for name in names]
        return ['/netp/test_1/', 'netp/test_1_/', 'my_file.npy']

    @property
    def processed_file_names(self) -> str:
        return 'data.pt'

    def download(self):
        pass
    def process(self):
        # Read data into a huge `Data` list.
        data_list = []
        i = 0
        for i in range(1, 10):
            data_list += read_myds_data('/netp/test_1/', 'netp/test_1_/', 'my_file.npy', i)  # call the reader and wrap the result into Data
        data, slices = self.collate(data_list)
        data = data if self.pre_transform is None else self.pre_transform(data)
        torch.save((data, slices), self.processed_paths[0])

    def __repr__(self) -> str:
        return f'{self.name}()'
```

And Python raised the following error:

```
~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch_geometric/data/in_memory_dataset.py in collate(data_list)
~/miniconda3/envs/myconda/lib/python3.8/site-packages/torch_geometric/data/collate.py in collate(cls, data_list, increment, add_batch, follow_batch, exclude_keys)
AttributeError: 'tuple' object has no attribute 'stores_as'
```

Thanks for your help!
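(For reference, a minimal, self-contained sketch of the data-list pattern that `InMemoryDataset.collate` expects, with made-up random tensors standing in for the real parameter graphs:

```python
import torch
from torch_geometric.data import Data, InMemoryDataset

# Build one Data object per sample and append it, so that collate()
# receives a list of Data objects rather than tuples.
data_list = []
for i in range(3):
    data_list.append(Data(
        x=torch.randn(4, 16),                  # 4 nodes, 16 features each
        edge_index=torch.tensor([[0, 1, 2],
                                 [1, 2, 3]]),  # shape [2, num_edges]
        y=torch.tensor([float(i)]),            # one label per graph
    ))

data, slices = InMemoryDataset.collate(data_list)
print(data)  # one concatenated Data object plus a slice dictionary
```
)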
-
Hi @Kevin-shihello-world
-
Note that `raw_dir` needs to be a string, not a set/list of strings:

```python
@property
def raw_dir(self) -> str:
    return '/netp/test_1'
```
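To see why, note that PyG builds `raw_paths` by joining `raw_dir` with each entry of `raw_file_names`. A simplified sketch of that behaviour (not the exact library code, and with a hypothetical file list):

```python
import os.path as osp

raw_dir = '/netp/test_1'
raw_file_names = ['1_my_file.npy']  # hypothetical file list

# Dataset.raw_paths is essentially this join, which is why raw_dir
# must be a single directory string that osp.join can consume.
raw_paths = [osp.join(raw_dir, f) for f in raw_file_names]
print(raw_paths)  # ['/netp/test_1/1_my_file.npy']
```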