Skip to content

Commit 9387412

Browse files
committed
add github and facebook datasets
1 parent 3573344 commit 9387412

File tree

8 files changed

+126
-5
lines changed

8 files changed

+126
-5
lines changed

dhg/data/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from .cocitation import CocitationCora, CocitationCiteseer, CocitationPubmed
1010
from .blogcatalog import BlogCatalog
1111
from .flickr import Flickr
12+
from .github import Github
13+
from .facebook import Facebook
1214

1315
__all__ = [
1416
"BaseData",
@@ -17,6 +19,8 @@
1719
"Pubmed",
1820
"BlogCatalog",
1921
"Flickr",
22+
"Github",
23+
"Facebook",
2024
"Cooking200",
2125
"MovieLens1M",
2226
"Yelp2018",

dhg/data/facebook.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from typing import Optional
2+
from functools import partial
3+
4+
from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor
5+
6+
from .base import BaseData
7+
8+
9+
class Facebook(BaseData):
    r"""The Facebook dataset is a social network dataset for vertex classification task.
    A page-page graph of verified Facebook sites. Nodes correspond to official Facebook pages, links to mutual likes between sites.
    Node features are extracted from the site descriptions.
    More details see the `Multi-Scale Attributed Node Embedding <https://arxiv.org/pdf/1909.13021.pdf>`_ paper.

    .. note::
        The L1-normalization for the feature is not recommended for this dataset.

    The content of the Facebook dataset includes the following:

    - ``num_classes``: The number of classes: :math:`4`.
    - ``num_vertices``: The number of vertices: :math:`22,470`.
    - ``num_edges``: The number of edges: :math:`85,501`.
    - ``dim_features``: The dimension of features: :math:`4,714`.
    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(22,470 \times 4,714)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(85,501 \times 2)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(22,470, )`.

    Args:
        ``data_root`` (``str``, optional): The directory in which the data is stored. If set to ``None``, the data will be auto-downloaded from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("facebook", data_root)
        self._content = {
            # Scalar metadata. These values must agree with the figures listed
            # in the class docstring above.
            "num_classes": 4,
            "num_vertices": 22470,
            "num_edges": 85501,
            "dim_features": 4714,
            # File-backed items: each names the file(s) it depends on (with
            # their MD5), the loader, and optional preprocessing steps.
            # NOTE(review): presumably resolved by ``BaseData`` -- confirm there.
            "features": {
                "upon": [{"filename": "features.pkl", "md5": "046eec1b67fb5bf504eaad75e98af141"}],
                "loader": load_from_pickle,
                # L1-normalization, i.e. ``partial(norm_ft, ord=1)``, is
                # deliberately left out; see the note in the class docstring.
                "preprocess": [to_tensor],
            },
            "edge_list": {
                "upon": [{"filename": "edge_list.pkl", "md5": "98c6551d020c7741554cae5eab8336ef"}],
                "loader": load_from_pickle,
            },
            "labels": {
                "upon": [{"filename": "labels.pkl", "md5": "ae0c116274cedc00522df66bd921affc"}],
                "loader": load_from_pickle,
                "preprocess": [to_long_tensor],
            },
        }

dhg/data/github.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from typing import Optional
2+
from functools import partial
3+
4+
from dhg.datapipe import load_from_pickle, norm_ft, to_tensor, to_long_tensor, to_bool_tensor
5+
6+
from .base import BaseData
7+
8+
9+
class Github(BaseData):
    r"""The Github dataset is a collaboration network dataset for vertex classification task.
    Nodes correspond to developers who have starred at least 10 repositories and edges to mutual follower relationships.
    Node features are location, starred repositories, employer and e-mail address.
    The labels are binary, denoting whether a developer is a web developer or a machine learning developer.
    More details see the `Multi-Scale Attributed Node Embedding <https://arxiv.org/pdf/1909.13021.pdf>`_ paper.

    .. note::
        The L1-normalization for the feature is not recommended for this dataset.

    The content of the Github dataset includes the following:

    - ``num_classes``: The number of classes: :math:`2`.
    - ``num_vertices``: The number of vertices: :math:`37,700`.
    - ``num_edges``: The number of edges: :math:`144,501`.
    - ``dim_features``: The dimension of features: :math:`4,005`.
    - ``features``: The vertex feature matrix. ``torch.Tensor`` with size :math:`(37,700 \times 4,005)`.
    - ``edge_list``: The edge list. ``List`` with length :math:`(144,501 \times 2)`.
    - ``labels``: The label list. ``torch.LongTensor`` with size :math:`(37,700, )`.

    Args:
        ``data_root`` (``str``, optional): The directory in which the data is stored. If set to ``None``, the data will be auto-downloaded from the server and saved into the default directory ``~/.dhg/datasets/``. Defaults to ``None``.
    """

    def __init__(self, data_root: Optional[str] = None) -> None:
        super().__init__("github", data_root)
        self._content = {
            # Scalar metadata. These values must agree with the figures listed
            # in the class docstring above (the labels are binary, so 2 classes).
            "num_classes": 2,
            "num_vertices": 37700,
            "num_edges": 144501,
            "dim_features": 4005,
            # File-backed items: each names the file(s) it depends on (with
            # their MD5), the loader, and optional preprocessing steps.
            # NOTE(review): presumably resolved by ``BaseData`` -- confirm there.
            "features": {
                "upon": [{"filename": "features.pkl", "md5": "f097384b61876a22cf048d28a2193c5a"}],
                "loader": load_from_pickle,
                # L1-normalization, i.e. ``partial(norm_ft, ord=1)``, is
                # deliberately left out; see the note in the class docstring.
                "preprocess": [to_tensor],
            },
            "edge_list": {
                "upon": [{"filename": "edge_list.pkl", "md5": "57012ac55fe125d8865a693b09f794b3"}],
                "loader": load_from_pickle,
            },
            "labels": {
                "upon": [{"filename": "labels.pkl", "md5": "9b1282a2a8a23c9f3b480136055c8b6b"}],
                "loader": load_from_pickle,
                "preprocess": [to_long_tensor],
            },
        }

dhg/datapipe/common.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ def to_tensor(
2727
elif isinstance(X, scipy.sparse.csr_matrix):
2828
X = X.todense()
2929
X = torch.tensor(X)
30+
elif isinstance(X, scipy.sparse.coo_matrix):
31+
X = X.todense()
32+
X = torch.tensor(X)
3033
elif isinstance(X, np.ndarray):
3134
X = torch.tensor(X)
3235
else:

dhg/models/graphs/lightgcn.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,17 @@ class LightGCN(nn.Module):
1818
``num_items`` (``int``): The Number of items.
1919
``emb_dim`` (``int``): Embedding dimension.
2020
``num_layers`` (``int``): The Number of layers. Defaults to ``3``.
21+
``drop_rate`` (``float``): Dropout rate. Randomly dropout the connections in training stage with probability ``drop_rate``. Default: ``0.0``.
2122
"""
2223

2324
def __init__(
24-
self, num_users: int, num_items: int, emb_dim: int, num_layers: int = 3
25+
self, num_users: int, num_items: int, emb_dim: int, num_layers: int = 3, drop_rate: float = 0.0
2526
) -> None:
2627

2728
super().__init__()
2829
self.num_users, self.num_items = num_users, num_items
2930
self.num_layers = num_layers
31+
self.drop_rate = drop_rate
3032
self.u_embedding = nn.Embedding(num_users, emb_dim)
3133
self.i_embedding = nn.Embedding(num_items, emb_dim)
3234
self.reset_parameters()
@@ -43,13 +45,14 @@ def forward(self, ui_bigraph: BiGraph) -> Tuple[torch.Tensor, torch.Tensor]:
4345
Args:
4446
``ui_bigraph`` (``dhg.BiGraph``): The user-item bipartite graph.
4547
"""
48+
drop_rate = self.drop_rate if self.training else 0.0
4649
u_embs = self.u_embedding.weight
4750
i_embs = self.i_embedding.weight
4851
all_embs = torch.cat([u_embs, i_embs], dim=0)
4952

5053
embs_list = [all_embs]
5154
for _ in range(self.num_layers):
52-
all_embs = ui_bigraph.smoothing_with_GCN(all_embs)
55+
all_embs = ui_bigraph.smoothing_with_GCN(all_embs, drop_rate=drop_rate)
5356
embs_list.append(all_embs)
5457
embs = torch.stack(embs_list, dim=1)
5558
embs = torch.mean(embs, dim=1)

dhg/structure/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def __init__(
4545
extra_selfloop: bool = False,
4646
device: torch.device = torch.device("cpu"),
4747
):
48+
assert isinstance(num_v, int) and num_v > 0, "num_v should be a positive integer"
4849
self.clear()
4950
self.device = device
5051
self._num_v = num_v
@@ -374,6 +375,7 @@ def __init__(
374375
v_weight: Optional[List[float]] = None,
375376
device: torch.device = torch.device("cpu"),
376377
):
378+
assert isinstance(num_v, int) and num_v > 0, "num_v should be a positive integer"
377379
self.clear()
378380
self._num_v = num_v
379381
self.device = device

docs/source/api/data.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ Graph Datasets
2121
dhg.data.Citeseer
2222
dhg.data.BlogCatalog
2323
dhg.data.Flickr
24+
dhg.data.Github
25+
dhg.data.Facebook
2426

2527

2628
Hypergraph Datasets

examples/ui_recommender/lightgcn_gowalla.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,12 @@ def forward(self, emb_users, emb_items, users, pos_items, neg_items, raw_emb_use
3535

3636
def train(net, data_loader, optimizer, criterion, epoch):
3737
net.train()
38-
dropped_ui_bigraph = ui_bigraph.drop_edges(0.2)
38+
3939
loss_mean, st = 0, time.time()
4040
for users, pos_items, neg_items in data_loader:
4141
users, pos_items, neg_items = users.to(device), pos_items.to(device), neg_items.to(device)
4242
optimizer.zero_grad()
43-
emb_users, emb_items = net(dropped_ui_bigraph)
43+
emb_users, emb_items = net(ui_bigraph)
4444
loss = criterion(
4545
emb_users, emb_items, users, pos_items, neg_items, net.u_embedding.weight, net.i_embedding.weight,
4646
)
@@ -88,7 +88,7 @@ def test(net, data_loader):
8888

8989
if __name__ == "__main__":
9090
# from dhg.utils import simple_stdout2file
91-
# simple_stdout2file("/home/fengyifan/lightgcn_gowalla.log")
91+
# simple_stdout2file("/home/fengyifan/lightgcn_gowalla_drop.log")
9292
dim_emb = 64
9393
lr = 0.001
9494
num_workers = 0

0 commit comments

Comments
 (0)